def rd_blocklist(sites, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'],
                 start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01'):
    """
    Wrapper function to extract info about when data has changed between modification dates.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    datasources : list of str
        Hydstra datasource codes (usually ['A']).
    variables : list of str
        The Hydstra data variables as strings ('140' is flow).
    start : str
        The start time in the format of '2001-01-01'.
    end : str
        Same formatting as start.
    start_modified : str
        The starting date of the modification.
    end_modified : str
        The ending date of the modification.

    Returns
    -------
    DataFrame
        With site, data_source, varto, from_mod_date, and to_mod_date.
    """
    ### Process sites
    sites1 = select_sites(sites).tolist()

    ### Open connection
    hyd = openHyDb()
    with hyd as h:
        df = h.get_ts_blockinfo(sites1, start=start, end=end, datasources=datasources, variables=variables,
                                start_modified=start_modified, end_modified=end_modified)
    return df
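
def _rd_blocklist_example():
    """
    Hedged usage sketch of rd_blocklist (not part of the original module): list the
    data blocks modified during June 2017. The site number is illustrative only, and
    the hydllp connection described above is assumed to be available.
    """
    return rd_blocklist([69607], start_modified='2017-06-01', end_modified='2017-07-01')
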
def hydstra_site_mod_time(sites=None):
    """
    Function to extract modification times from Hydstra data archive files. Returns a DataFrame of sites by modification date. The modification date is in GMT.

    Parameters
    ----------
    sites : list, array, Series, or None
        If sites is not None, then return only the given sites.

    Returns
    -------
    DataFrame
    """
    site_files_path = r'\\fileservices02\ManagedShares\Data\Hydstra\prod\hyd\dat\hyd'
    files1 = rd_dir(site_files_path, 'A')
    file_sites = [path.splitext(i)[0] for i in files1]

    if sites is not None:
        sites1 = select_sites(sites).astype(str)
        sites2 = [i.replace('/', '_') for i in sites1]
        file_sites1 = [i for i in file_sites if i in sites2]
    else:
        file_sites1 = file_sites

    mod_times = to_datetime([round(path.getmtime(path.join(site_files_path, i + '.A'))) for i in file_sites1], unit='s')

    df = DataFrame({'site': file_sites1, 'mod_time': mod_times})
    return df
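
def _hydstra_site_mod_time_example():
    """
    Hedged usage sketch (not part of the original module): check when the Hydstra
    archive files for two sites were last modified. Site numbers are illustrative
    only, and access to the file share above is assumed.
    """
    return hydstra_site_mod_time([69607, 70105])
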
def get_ts_traces(self, site_list, start=0, end=0, varfrom=100, varto=140, interval='day', multiplier=1,
                  datasource='A', data_type='mean', qual_codes=[30, 20, 10, 11, 21, 18], report_time=None):
    """
    Method to request time series traces from hydllp and return them as a DataFrame with a site/time MultiIndex, filtered by quality code.
    """
    # Convert the site list to a comma delimited string of sites
    sites = select_sites(site_list).astype(str)
    site_list_str = ','.join([str(site) for site in sites])

    ### Datetime conversion - with dates < 1900
    c1900 = Timestamp('1900-01-01')
    if start != 0:
        start1 = Timestamp(start)
        if start1 > c1900:
            start = start1.strftime('%Y%m%d%H%M%S')
        else:
            # strftime can fail for dates before 1900 (older Python), so build the string manually
            start = start1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')
    if end != 0:
        end1 = Timestamp(end)
        if end1 > c1900:
            end = end1.strftime('%Y%m%d%H%M%S')
        else:
            end = end1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')

    ts_traces_request = {'function': 'get_ts_traces',
                         'version': 2,
                         'params': {'site_list': site_list_str,
                                    'start_time': start,
                                    'end_time': end,
                                    'varfrom': varfrom,
                                    'varto': varto,
                                    'interval': interval,
                                    'datasource': datasource,
                                    'data_type': data_type,
                                    'multiplier': multiplier,
                                    'report_time': report_time}}

    ts_traces_request = self.query_by_dict(ts_traces_request)
    j1 = ts_traces_request['return']['traces']

    ### Convert json to a dataframe
    sites = [str(f['site']) for f in j1]
    out1 = DataFrame()
    for i in range(len(j1)):
        df1 = DataFrame(j1[i]['trace'])
        if not df1.empty:
            df1.rename(columns={'v': 'data', 't': 'time', 'q': 'qual_code'}, inplace=True)
            df1['data'] = to_numeric(df1['data'], errors='coerce')
            df1['time'] = to_datetime(df1['time'], format='%Y%m%d%H%M%S')
            df1['qual_code'] = to_numeric(df1['qual_code'], errors='coerce', downcast='integer')
            df1['site'] = sites[i]
            df2 = df1[df1.qual_code.isin(qual_codes)]
            out1 = concat([out1, df2])

    out2 = out1.set_index(['site', 'time'])[['data', 'qual_code']]
    return out2
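
def _get_ts_traces_example():
    """
    Hedged usage sketch (not part of the original module) of the get_ts_traces
    method via the openHyDb context manager, mirroring how rd_hydstra_db calls it
    further below. The site number is illustrative only.
    """
    hyd = openHyDb()
    with hyd as h:
        df = h.get_ts_traces([69607], start='2010-01-01', end='2010-12-31')
    return df
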
def flow_sites_to_shp(sites='All', min_flow_only=False, export=False, export_path='sites.shp'):
    """
    Function to create a GeoDataFrame/shapefile from flow sites.
    """
    ### Import from databases
    if min_flow_only:
        min_flow_sites = rd_sql('SQL2012PROD05', 'Wells', '"vMinimumFlowSites+Consent+Well_classes"',
                                col_names=['RefDbase', 'RefDbaseKey', 'restrictionType', 'RecordNo', 'WellNo'],
                                where_col='RefDbase', where_val=['Gauging', 'Hydrotel'])
        min_flow_sites.columns = ['type', 'site', 'restr', 'crc', 'wap']
        min_flow_sites['site'] = min_flow_sites['site'].astype(int)
        min_flow_sites = min_flow_sites[min_flow_sites.restr == 'LowFlow']

    site_geo = rd_sql('SQL2012PROD05', 'GIS', 'vGAUGING_NZTM', col_names=['SiteNumber', 'RIVER', 'SITENAME'],
                      geo_col=True)
    site_geo.columns = ['site', 'river', 'site_name', 'geometry']
    site_geo['river'] = site_geo.river.apply(lambda x: x.title())
    site_geo['site_name'] = site_geo.site_name.apply(lambda x: x.title())
    site_geo['site_name'] = site_geo.site_name.apply(lambda x: x.replace(' (Recorder)', ''))
    site_geo['site_name'] = site_geo.site_name.apply(lambda x: x.replace('Sh', 'SH'))
    site_geo['site_name'] = site_geo.site_name.apply(lambda x: x.replace('Ecs', 'ECS'))

    ### Select sites
    if isinstance(sites, str):
        if sites == 'All':
            sites_sel_geo = site_geo
        elif sites.endswith('.shp'):
            poly = read_file(sites)
            sites_sel_geo = sel_sites_poly(poly, site_geo)
        else:
            raise ValueError("If sites is a str, then it must be 'All' or a shapefile path.")
    else:
        sites_sel = select_sites(sites).astype('int32')
        sites_sel_geo = site_geo[in1d(site_geo.site, sites_sel)]
    if min_flow_only:
        sites_sel_geo = sites_sel_geo[in1d(sites_sel_geo.site, min_flow_sites.site.values)]

    ### Export and return
    if export:
        sites_sel_geo.to_file(export_path)
    return sites_sel_geo
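
def _flow_sites_to_shp_example():
    """
    Hedged usage sketch (not part of the original module): export the minimum flow
    sites among two illustrative site numbers to a shapefile.
    """
    return flow_sites_to_shp(sites=[69607, 70105], min_flow_only=True, export=True,
                             export_path='min_flow_sites.shp')
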
def rd_hydstra_db(sites, start=0, end=0, datasource='A', data_type='mean', varfrom=100, varto=140,
                  interval='day', multiplier=1, qual_codes=[30, 20, 10, 11, 21, 18], report_time=None,
                  sites_chunk=20, print_sites=False, export_path=None):
    """
    Wrapper function over hydllp to read in data from Hydstra's database. Must be run in a 32bit python. If either start or end is not 0, then they both need a date.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    start : str or int of 0
        The start time in the format of either '2001-01-01' or 0 (for all data).
    end : str or int of 0
        Same formatting as start.
    datasource : str
        Hydstra datasource code (usually 'A').
    data_type : str
        mean, maxmin, max, min, start, end, first, last, tot, point, partialtot, or cum.
    varfrom : int or float
        The hydstra source data variable (100.00 is water level).
    varto : int or float
        The hydstra conversion data variable (140.00 is flow).
    interval : str
        The frequency of the output data (year, month, day, hour, minute, second, period). If data_type is 'point', then interval cannot be 'period' (use anything else, it doesn't matter).
    multiplier : int
        The multiplier applied to the interval (e.g. interval='day' with multiplier=1 returns daily values).
    qual_codes : list of int
        The quality codes in Hydstra for filtering the data.
    sites_chunk : int
        Number of sites to request from hydllp at one time. Do not change unless you understand what it does.
    print_sites : bool
        Should each chunk of sites be printed as it is processed?
    export_path : str or None
        The path and file name to be saved.

    Returns
    -------
    DataFrame
        In long format with site and time as a MultiIndex.
    """
    ### Process sites into workable chunks
    sites1 = select_sites(sites)
    n_chunks = int(ceil(len(sites1) / float(sites_chunk)))
    sites2 = array_split(sites1, n_chunks)

    ### Run instance of hydllp
    data = DataFrame()
    for i in sites2:
        if print_sites:
            print(i)
        ### Open connection
        hyd = openHyDb()
        with hyd as h:
            df = h.get_ts_traces(i, start=start, end=end, datasource=datasource, data_type=data_type,
                                 varfrom=varfrom, varto=varto, interval=interval, multiplier=multiplier,
                                 qual_codes=qual_codes, report_time=report_time)
        data = concat([data, df])

    if isinstance(export_path, str):
        save_df(data, export_path)

    return data
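
def _rd_hydstra_db_example():
    """
    Hedged usage sketch (not part of the original module; assumes the 32bit
    Python/hydllp environment noted above): daily mean flow (water level converted
    to flow) for two illustrative site numbers, saved to csv.
    """
    return rd_hydstra_db([69607, 70105], start='2010-01-01', end='2015-06-30',
                         export_path='flow.csv')
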
def xy_to_gpd(id_col, x_col, y_col, df=None, crs=2193):
    """
    Function to convert a DataFrame with x and y coordinates to a GeoDataFrame.

    Arguments:\n
    id_col -- The column(s) from the dataframe to be returned. Either a one name string or a list of column names.\n
    x_col -- Either the column name that has the x values within the df or an array of x values.\n
    y_col -- Same as x_col, but for the y values.\n
    df -- The DataFrame with the x and y coordinate columns (when x_col and y_col are column names).\n
    crs -- The projection of the data.
    """
    if isinstance(x_col, str):
        geometry = [Point(xy) for xy in zip(df[x_col], df[y_col])]
    else:
        x1 = select_sites(x_col)
        y1 = select_sites(y_col)
        geometry = [Point(xy) for xy in zip(x1, y1)]
    if isinstance(id_col, str) & (df is not None):
        id_data = df[id_col]
    elif isinstance(id_col, list):
        if df is not None:
            id_data = df[id_col]
        else:
            id_data = id_col
    elif isinstance(id_col, (ndarray, Series, Index)):
        id_data = id_col
    else:
        raise ValueError('id_data could not be determined')
    if isinstance(crs, int):
        crs1 = convert_crs(crs)
    elif isinstance(crs, (str, dict)):
        crs1 = crs
    else:
        raise ValueError('crs must be an int, str, or dict')
    gpd = GeoDataFrame(id_data, geometry=geometry, crs=crs1)
    return gpd
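
def _xy_to_gpd_example():
    """
    Self-contained sketch (not part of the original module): build a GeoDataFrame
    from plain NZTM x/y columns. The coordinates are illustrative only.
    """
    from pandas import DataFrame

    pts = DataFrame({'site': [1, 2], 'x': [1570000.0, 1580000.0], 'y': [5180000.0, 5190000.0]})
    return xy_to_gpd('site', 'x', 'y', df=pts, crs=2193)
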
def get_ts_blockinfo(self, site_list, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'],
                     start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01',
                     fill_gaps=0, auditinfo=0):
    """
    Method to request block (modification) info from hydllp and return it as a DataFrame.
    """
    # Convert the site list to a comma delimited string of sites
    sites = select_sites(site_list).astype(str)
    site_list_str = ','.join([str(site) for site in sites])

    ### Datetime conversion
    start = Timestamp(start).strftime('%Y%m%d%H%M%S')
    end = Timestamp(end).strftime('%Y%m%d%H%M%S')
    start_modified = Timestamp(start_modified).strftime('%Y%m%d%H%M%S')
    end_modified = Timestamp(end_modified).strftime('%Y%m%d%H%M%S')

    ### dict request
    ts_blockinfo_request = {"function": "get_ts_blockinfo",
                            "version": 2,
                            "params": {'site_list': site_list_str,
                                       'datasources': datasources,
                                       'variables': variables,
                                       'starttime': start,
                                       'endtime': end,
                                       'start_modified': start_modified,
                                       'end_modified': end_modified}}

    ts_blockinfo_result = self.query_by_dict(ts_blockinfo_request)
    blocks = ts_blockinfo_result['return']['blocks']
    df1 = DataFrame(blocks)
    if df1.empty:
        return df1
    else:
        df1['endtime'] = to_datetime(df1['endtime'], format='%Y%m%d%H%M%S')
        df1['starttime'] = to_datetime(df1['starttime'], format='%Y%m%d%H%M%S')
        df1['variable'] = to_numeric(df1['variable'], errors='coerce', downcast='integer')

        df2 = df1[['site', 'datasource', 'variable', 'starttime', 'endtime']].sort_values(['site', 'variable', 'starttime'])
        df2.rename(columns={'datasource': 'data_source', 'variable': 'varto', 'starttime': 'from_mod_date',
                            'endtime': 'to_mod_date'}, inplace=True)

        return df2
def stream_nat(sites, catch_shp=r'P:\cant_catch_delin\recorders\catch_del.shp', include_gw=True,
               max_date='2015-06-30', sd_hdf='S:/Surface Water/shared/base_data/usage/sd_est_all_mon_vol.h5',
               flow_csv=None, crc_shp=r'S:\Surface Water\shared\GIS_base\vector\allocations\allo_gis.shp',
               catch_col='site', pivot=False, return_data=False, export_path=None):
    """
    Function to naturalize stream flows from monthly sums of usage.

    Arguments:\n
    sites -- A list of recorder sites to be naturalised.\n
    catch_shp -- A shapefile of the delineated catchments for all recorders.\n
    include_gw -- Should stream depleting GW takes be included?\n
    max_date -- The last date to be naturalised. In the form of '2015-06-30'.\n
    sd_hdf -- The hdf file of all the crc/waps with estimated usage and allocation.\n
    flow_csv -- If None, then use the hydro class to import the data. Otherwise, flow data can be imported as a csv file with the first column as datetime and each other column as a recorder site in m3/s. It can also be a dataframe.\n
    crc_shp -- A shapefile of all of the locations of the crc/waps.\n
    catch_col -- The column in catch_shp that contains the site numbers.\n
    pivot -- Should the output be pivoted?\n
    return_data -- Should the allocation/usage time series be returned?\n
    export_path -- The path and file name to be saved.
    """
    qual_codes = [10, 18, 20, 50]

    ### Read in data
    ## Site numbers
    sites1 = select_sites(sites)

    ## Stream depletion
    sd = read_hdf(sd_hdf)
    if include_gw:
        sd1 = sd[sd.time <= max_date]
    else:
        sd1 = sd[(sd.take_type == 'Take Surface Water') & (sd.time <= max_date)]

    ## Recorder flow
    if isinstance(flow_csv, str):
        flow = rd_ts(flow_csv)
        flow.columns = flow.columns.astype(int)
        flow.index.name = 'time'
        flow.columns.name = 'site'
        flow = flow.stack()
        flow.name = 'flow'
        flow.index = flow.index.reorder_levels(['site', 'time'])
        flow = flow.sort_index()
    elif isinstance(flow_csv, DataFrame):
        flow = flow_csv.copy()
        flow.columns = flow.columns.astype(int)
        flow.index.name = 'time'
        flow.columns.name = 'site'
        flow = flow.stack()
        flow.name = 'flow'
        flow.index = flow.index.reorder_levels(['site', 'time'])
        flow = flow.sort_index()
    elif flow_csv is None:
        flow = hydro().get_data(mtypes='flow', sites=sites1, qual_codes=qual_codes)
        sites1 = flow.sites
        flow = flow.data
        flow.index = flow.index.droplevel('mtype')
        flow.name = 'flow'
    else:
        raise ValueError('Pass something useful to flow_csv.')

    ## crc shp
    crc_loc = read_file(crc_shp)
    crc_loc1 = merge(crc_loc[['crc', 'take_type', 'allo_block', 'wap', 'use_type', 'geometry']],
                     sd[['crc', 'take_type', 'allo_block', 'wap', 'use_type']].drop_duplicates(),
                     on=['crc', 'take_type', 'allo_block', 'wap', 'use_type'])

    ## Catchment areas shp
    catch = read_file(catch_shp).drop('NZREACH', axis=1)
    catch = catch[catch[catch_col].isin(sites1)]

    ### Spatial processing of WAPs, catchments, and sites
    ## WAPs to catchments sjoin
    crc_catch, catch2 = pts_poly_join(crc_loc1, catch, catch_col)
    # id_areas = catch2.area.copy()
    # tot_areas = catch2.area.copy()

    ## Unique catchments/gauges
    # sites = wap_catch[catch_col].unique()
    # sites2 = catch[catch_col].unique()

    ### Next data import
    ## Gaugings
    # gaugings = rd_henry(sites=sites.astype('int32'), agg_day=True, sites_by_col=True)
    # gaugings.columns = gaugings.columns.astype(int)

    ## site specific flow
    # rec_sites = flow.columns[in1d(flow.columns, sites)]
    # gauge_sites = sites[~in1d(sites, rec_sites)]
    # gauge_sites2 = gaugings.columns[in1d(gaugings.columns, gauge_sites)]
    # site_flow = flow[rec_sites]
    # gaugings = gaugings[gauge_sites2]

    ### Filter down the sites
    sd1a = merge(crc_catch, sd1, on=['crc', 'take_type', 'allo_block', 'wap', 'use_type']).drop('geometry', axis=1)

    ### Remove excessive usages
    sd1a = sd1a[~((sd1a.sd_usage / sd1a.ann_restr_allo_m3 / 12) >= 1.5)]

    ### Calc SD for site and month
    sd2 = sd1a.groupby(['site', 'time'])['sd_usage'].sum().reset_index()
    days1 = sd2.time.dt.daysinmonth
    sd2['sd_rate'] = sd2.sd_usage / days1 / 24 / 60 / 60

    ### Resample SD to daily time series
    days2 = to_timedelta((days1 / 2).round().astype('int32'), unit='D')
    sd3 = sd2.drop('sd_usage', axis=1)
    sd3.loc[:, 'time'] = sd3.loc[:, 'time'] - days2
    grp1 = sd3.groupby(['site'])
    first1 = grp1.first()
    last1 = sd2.groupby('site')[['time', 'sd_rate']].last()
    first1.loc[:, 'time'] = to_datetime(first1.loc[:, 'time'].dt.strftime('%Y-%m') + '-01')
    sd4 = concat([first1.reset_index(), sd3, last1.reset_index()]).reset_index(drop=True).sort_values(['site', 'time'])
    sd5 = sd4.set_index('time')
    sd6 = sd5.groupby('site').apply(lambda x: x.resample('D').interpolate(method='pchip'))['sd_rate']

    ### Naturalise flows
    nat1 = concat([flow, sd6], axis=1, join='inner')
    nat1['nat_flow'] = nat1['flow'] + nat1['sd_rate']

    ## Normalize to area if desired
    # if norm_area:
    #     # recorder flow in mm/day
    #     site_order = tot_areas[flow1.columns].values / 60 / 60 / 24 / 1000
    #     flow_norm = flow1.div(site_order)
    #     nat_flow_norm = nat_flow.div(site_order)
    #
    #     # Gauges flow in mm/day
    #     site_order = tot_areas[gaugings1.columns].values / 60 / 60 / 24 / 1000
    #     gaugings_norm = gaugings1.div(site_order)
    #     nat_gauge_norm = nat_gauge.div(site_order)
    #
    #     ### Export and return results
    #     if export:
    #         nat_flow_norm.to_csv(export_rec_flow_path)
    #         nat_gauge_norm.to_csv(export_gauge_flow_path)
    #     return([flow_norm, gaugings_norm, nat_flow_norm, nat_gauge_norm])
    # else:
    #     if export:
    #         nat_flow.to_csv(export_rec_flow_path)
    #         nat_gauge.to_csv(export_gauge_flow_path)
    #     return([flow1, gaugings1, nat_flow, nat_gauge])

    if pivot:
        nat2 = nat1.round(3).unstack('site')
    else:
        nat2 = nat1.round(3)

    if isinstance(export_path, str):
        save_df(nat2, export_path)

    if return_data:
        return nat2, sd1a
    else:
        return nat2
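
def _monthly_usage_to_daily_rate_example():
    """
    Self-contained sketch (not part of the original module) of the conversion used
    in stream_nat above: a monthly usage volume (m3) becomes a mean rate (m3/s),
    is anchored near mid-month, and is interpolated to a daily series. The volumes
    are illustrative; 'pchip' interpolation requires scipy.
    """
    from pandas import DataFrame, to_datetime, to_timedelta

    mon = DataFrame({'time': to_datetime(['2015-01-31', '2015-02-28']),
                     'sd_usage': [86400.0 * 31, 0.0]})
    days1 = mon.time.dt.daysinmonth
    mon['sd_rate'] = mon.sd_usage / days1 / 24 / 60 / 60  # first month works out to exactly 1 m3/s
    mon['time'] = mon['time'] - to_timedelta((days1 / 2).round().astype('int32'), unit='D')
    return mon.set_index('time')['sd_rate'].resample('D').interpolate(method='pchip')
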
def rd_henry(sites, from_date=None, to_date=None, agg_day=True, sites_by_col=False, min_filter=None, export=False,
             export_path='gauge_flows.csv'):
    """
    Function to read in gaugings data from the "Henry DB". Hopefully, they keep this around for a while longer.

    Arguments:\n
    sites -- Either a list of site names or a file path string that contains a column of site names.\n
    from_date -- A date string for the start of the data (e.g. '2010-01-01').\n
    to_date -- A date string for the end of the data.\n
    agg_day -- Should the gauging dates be aggregated down to the day as opposed to having the hour and minute? Gaugings are aggregated by the mean.\n
    sites_by_col -- 'False' does not make a time series, rather it is organized by site, date, and gauging. 'True' creates a time series with the columns as gauging sites (will create many NAs).\n
    min_filter -- Minimum number of gaugings required for a site to be kept in the output.\n
    export -- Should the output be exported to csv?\n
    export_path -- The path and file name of the csv to be saved.
    """

    def resample1(df):
        df.index = df.date
        df2 = df.resample('D').mean()
        return df2

    #### Fields and names for databases
    ## Query fields - Be sure to use single quotes for the names!!!
    fields = ['SiteNo', 'SampleDate', 'Flow']

    ## Equivalent short names for analyses - Use these names!!!
    names = ['site', 'date', 'flow']

    #### Databases
    ### Gaugings data
    server = 'SQL2012PROD03'
    database = 'DataWarehouse'
    table = 'DataWarehouse.dbo.F_SG_BGauging'
    where_col = 'SiteNo'

    ## Will change to the following!!! Or stay as a duplicate...
    # database1 = 'Hydstra'
    # table1 = 'Hydstra.dbo.GAUGINGS'

    ########################################
    ### Read in data
    sites1 = select_sites(sites).tolist()
    data = rd_sql(server=server, database=database, table=table, col_names=fields, where_col=where_col,
                  where_val=sites1).dropna()
    data.columns = names

    ### Aggregate duplicates
    data2 = data.groupby(['site', 'date']).mean().reset_index()

    ### Aggregate by day
    if agg_day:
        data3 = data2.groupby(['site']).apply(resample1).reset_index().dropna()
    else:
        data3 = data2

    ### Filter out sites with less than min_filter
    if min_filter is not None:
        count1 = data3.groupby('site')['flow'].count()
        count_index = count1[count1 >= min_filter].index
        data3 = data3[in1d(data3.site.values, count_index)]

    ### Select within date range
    if from_date is not None:
        data3 = data3[data3.date >= from_date]
    if to_date is not None:
        data3 = data3[data3.date <= to_date]

    ### Reorganize data with sites as columns and dates as index
    if sites_by_col:
        data4 = data3.pivot(index='date', columns='site').xs('flow', axis=1).round(4)
    else:
        data4 = data3.round(4)

    if export:
        if sites_by_col:
            data4.to_csv(export_path)
        else:
            data4.to_csv(export_path, index=False)

    return data4
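
def _rd_henry_example():
    """
    Hedged usage sketch (not part of the original module): daily-aggregated
    gaugings for two illustrative site numbers, organized as a site-by-column
    time series.
    """
    return rd_henry([69607, 70105], from_date='2010-01-01', sites_by_col=True)
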
def rd_hydrotel(sites, mtype='river_flow_cont_raw', from_date=None, to_date=None, resample_code='D', period=1,
                fun='mean', val_round=3, min_count=None, pivot=False, export_path=None):
    """
    Function to extract time series data from the hydrotel database.

    Parameters
    ----------
    sites : list, array, dataframe, or str
        Site list or a str path to a single column csv file of site names/numbers.
    mtype : str
        'river_flow_cont_raw', 'aq_wl_cont_raw', 'atmos_precip_cont_raw', 'river_wl_cont_raw', or 'river_wtemp_cont_raw'.
    from_date : str or None
        The start date in the format '2000-01-01'.
    to_date : str or None
        The end date in the format '2000-01-01'.
    resample_code : str
        The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc.
    period : int
        The number of resampling periods. e.g. period = 2 and resample_code = 'D' would resample the values over a 2 day period.
    fun : str
        The resampling function. i.e. mean, sum, count, min, or max. No median yet...
    val_round : int
        The number of decimals to round the values.
    min_count : int or None
        The minimum number of values required within the resampling period to return a value.
    pivot : bool
        Should the output be pivoted into wide format?
    export_path : str or None
        The path and file name to be saved.

    Returns
    -------
    Series or DataFrame
        A MultiIndex Pandas Series if pivot is False and a DataFrame if True.
    """
    #### mtypes dict
    mtypes_dict = {'river_flow_cont_raw': 'Flow Rate',
                   'aq_wl_cont_raw': 'Water Level',
                   'atmos_precip_cont_raw': 'Rainfall Depth',
                   'river_wl_cont_raw': 'Water Level',
                   'river_wtemp_cont_raw': 'Water Temperature'}

    #### Database parameters
    server = 'SQL2012PROD05'
    database = 'Hydrotel'

    data_tab = 'Hydrotel.dbo.Samples'
    points_tab = 'Hydrotel.dbo.Points'
    objects_tab = 'Hydrotel.dbo.Objects'
    mtypes_tab = 'Hydrotel.dbo.ObjectVariants'
    sites_tab = 'Hydrotel.dbo.Sites'

    data_col = ['Point', 'DT', 'SampleValue']
    points_col = ['Point', 'Object']
    objects_col = ['Object', 'Site', 'Name', 'ObjectVariant']
    mtypes_col = ['ObjectVariant', 'Name']
    sites_col = ['Site', 'Name', 'ExtSysId']

    #### Import data and select the correct sites
    sites = select_sites(sites)
    if mtype == 'atmos_precip_cont_raw':
        site_ob1 = rd_sql(server, database, objects_tab, ['Site', 'ExtSysId'], 'ExtSysId',
                          sites.astype('int32').tolist())
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'], 'Site', site_ob1.Site.tolist())
        site_val1 = merge(site_val0, site_ob1, on='Site')
    elif mtype == 'aq_wl_cont_raw':
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'])
        site_val0.loc[:, 'Name'] = site_val0.apply(lambda x: x.Name.split(' ')[0], axis=1)
        site_val1 = site_val0[site_val0.Name.isin(sites)]
        site_val1.loc[:, 'ExtSysId'] = site_val1.loc[:, 'Name']
    else:
        site_val1 = rd_sql(server, database, sites_tab, sites_col, 'ExtSysId', sites.astype('int32').tolist())

    if site_val1.empty:
        raise ValueError('No site(s) in database')

    site_val1.loc[:, 'ExtSysId'] = to_numeric(site_val1.loc[:, 'ExtSysId'], errors='ignore')
    site_val = site_val1.Site.astype('int32').tolist()
    if isinstance(mtype, (list, ndarray, Series)):
        mtypes = [mtypes_dict[i] for i in mtype]
    elif isinstance(mtype, str):
        mtypes = [mtypes_dict[mtype]]
    else:
        raise ValueError('mtype must be a str, list, ndarray, or Series.')
    mtypes_val = rd_sql(server, database, mtypes_tab, mtypes_col, 'Name', mtypes)

    where_col = {'Site': site_val, 'ObjectVariant': mtypes_val.ObjectVariant.astype('int32').tolist()}

    object_val1 = rd_sql(server, database, objects_tab, objects_col, where_col)
    if mtype == 'aq_wl_cont_raw':
        object_val1 = object_val1[object_val1.Name == 'Water Level']
    if mtype == 'atmos_precip_cont_raw':
        object_val1 = object_val1[object_val1.Name == 'Rainfall']
    object_val = object_val1.Object.values.astype(int).tolist()

    #### Rearrange data
    point_val1 = rd_sql(server, database, points_tab, points_col, where_col='Object', where_val=object_val)
    point_val = point_val1.Point.values.astype(int).tolist()

    #### Big merge
    comp_tab1 = merge(site_val1, object_val1[['Object', 'Site']], on='Site')
    comp_tab2 = merge(comp_tab1, point_val1, on='Object')
    comp_tab2.set_index('Point', inplace=True)

    #### Pull out the data
    ### Make SQL statement
    data1 = rd_sql_ts(server, database, data_tab, 'Point', 'DT', 'SampleValue', resample_code, period, fun,
                      val_round, {'Point': point_val}, from_date=from_date, to_date=to_date,
                      min_count=min_count)['SampleValue']

    data1.index.names = ['site', 'time']
    data1.name = 'value'
    site_numbers = [comp_tab2.loc[i, 'ExtSysId'] for i in data1.index.levels[0]]
    data1.index.set_levels(site_numbers, level='site', inplace=True)

    if pivot:
        data3 = data1.unstack(0)
    else:
        data3 = data1

    #### Export and return
    if export_path is not None:
        save_df(data3, export_path)

    return data3
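
def _rd_hydrotel_example():
    """
    Hedged usage sketch (not part of the original module): weekly mean river flow
    from Hydrotel for an illustrative site number, pivoted into wide format.
    """
    return rd_hydrotel([69607], mtype='river_flow_cont_raw', resample_code='W', pivot=True)
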
def rd_squalarc(sites, mtypes=None, from_date=None, to_date=None, convert_dtl=False, dtl_method=None,
                export_path=None):
    """
    Function to read in "squalarc" data, which is actually stored in the mssql db.

    Arguments:\n
    sites -- The site names as a list, array, csv with the first column as the site names, or a polygon shapefile of the area of interest.\n
    mtypes -- A list of measurement type names to be in the output. Leaving it empty returns all mtypes.\n
    from_date -- A start date string in the format of '2010-01-01'.\n
    to_date -- An end date string in the format of '2011-01-01'.\n
    convert_dtl -- Should values under the detection limit be converted to numeric?\n
    dtl_method -- The method to use to convert values under a detection limit to numeric. None or 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis and includes an additional column dtl_ratio referring to the ratio of values under the detection limit.
    """
    #### Read in sites
    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05', 'Squalarc', 'SITES', col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry', axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05', 'Wells', 'WELL_DETAILS', col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry', axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = Series(sites1, name='site').astype(str).tolist()

    #### Extract the rest of the data
    if len(sites2) > 10000:
        n_chunks = int(ceil(len(sites2) * 0.0001))
        sites3 = [sites2[i::n_chunks] for i in range(n_chunks)]
        samples_tab = DataFrame()
        for i in sites3:
            samples_tab1 = rd_sql('SQL2012PROD05', 'Squalarc', '"SQL_SAMPLE_METHODS+"',
                                  col_names=['Site_ID', 'SAMPLE_NO', 'ME_TYP', 'Collect_Date', 'Collect_Time',
                                             'PA_NAME', 'PARAM_UNITS', 'SRESULT'],
                                  where_col='Site_ID', where_val=i)
            samples_tab1.columns = ['site', 'sample_id', 'source', 'date', 'time', 'parameter', 'units', 'val']
            samples_tab1.loc[:, 'source'] = samples_tab1.loc[:, 'source'].str.lower()
            samples_tab = concat([samples_tab, samples_tab1])
    else:
        samples_tab = rd_sql('SQL2012PROD05', 'Squalarc', '"SQL_SAMPLE_METHODS+"',
                             col_names=['Site_ID', 'SAMPLE_NO', 'ME_TYP', 'Collect_Date', 'Collect_Time',
                                        'PA_NAME', 'PARAM_UNITS', 'SRESULT'],
                             where_col='Site_ID', where_val=sites2)
        samples_tab.columns = ['site', 'sample_id', 'source', 'date', 'time', 'parameter', 'units', 'val']
        samples_tab.loc[:, 'source'] = samples_tab.loc[:, 'source'].str.lower()

    samples_tab2 = samples_tab.copy()
    num_test = to_numeric(samples_tab2.loc[:, 'time'], errors='coerce')
    samples_tab2.loc[num_test.isnull(), 'time'] = '0000'
    # Strip literal '.' characters from the time strings (regex=False so '.' is not treated as a wildcard)
    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('.', '', regex=False)
    samples_tab2 = samples_tab2[samples_tab2.date.notnull()]
    # samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('9999', '0000')
    time1 = to_datetime(samples_tab2.time, format='%H%M', errors='coerce')
    time1[time1.isnull()] = Timestamp('2000-01-01 00:00:00')
    datetime1 = to_datetime(samples_tab2.date.dt.date.astype(str) + ' ' + time1.dt.time.astype(str))
    samples_tab2.loc[:, 'date'] = datetime1
    samples_tab2 = samples_tab2.drop('time', axis=1)
    samples_tab2.loc[samples_tab2.val.isnull(), 'val'] = nan
    samples_tab2.loc[samples_tab2.val == 'N/A', 'val'] = nan

    #### Select within time range
    if isinstance(from_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] >= from_date]
    if isinstance(to_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] <= to_date]

    if mtypes is not None:
        mtypes1 = select_sites(mtypes)
        data = samples_tab2[samples_tab2.parameter.isin(mtypes1)].reset_index(drop=True)
    else:
        data = samples_tab2.reset_index(drop=True)

    #### Correct poorly typed in site names
    data.loc[:, 'site'] = data.loc[:, 'site'].str.upper().str.replace(' ', '')

    #### Convert detection limit values
    if convert_dtl:
        less1 = data['val'].str.match('<')
        if less1.sum() > 0:
            less1.loc[less1.isnull()] = False
            data2 = data.copy()
            data2.loc[less1, 'val'] = to_numeric(data.loc[less1, 'val'].str.replace('<', ''), errors='coerce') * 0.5
            if dtl_method in (None, 'standard'):
                data3 = data2
            if dtl_method == 'trend':
                df1 = data2.loc[less1]
                count1 = data.groupby('parameter')['val'].count()
                count1.name = 'tot_count'
                count_dtl = df1.groupby('parameter')['val'].count()
                count_dtl.name = 'dtl_count'
                count_dtl_val = df1.groupby('parameter')['val'].nunique()
                count_dtl_val.name = 'dtl_val_count'
                combo1 = concat([count1, count_dtl, count_dtl_val], axis=1, join='inner')
                combo1['dtl_ratio'] = (combo1['dtl_count'] / combo1['tot_count']).round(2)

                ## conditionals
                # param1 = combo1[(combo1['dtl_ratio'] <= 0.4) | (combo1['dtl_ratio'] == 1)]
                # under_40 = data['parameter'].isin(param1.index)
                param2 = combo1[(combo1['dtl_ratio'] > 0.4) & (combo1['dtl_val_count'] != 1)]
                over_40 = data['parameter'].isin(param2.index)

                ## Calc detection limit values
                data3 = merge(data, combo1['dtl_ratio'].reset_index(), on='parameter', how='left')
                data3.loc[:, 'val_dtl'] = data2['val']

                max_dtl_val = data2[over_40 & less1].groupby('parameter')['val'].transform('max')
                max_dtl_val.name = 'dtl_val_max'
                data3.loc[over_40 & less1, 'val_dtl'] = max_dtl_val
        else:
            data3 = data
    else:
        data3 = data

    #### Return and export
    if isinstance(export_path, str):
        data3.to_csv(export_path, encoding='utf-8', index=False)

    return data3
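
def _dtl_conversion_example():
    """
    Self-contained sketch (not part of the original module) of the 'standard'
    detection-limit rule used in rd_squalarc above: values reported as '<x' are
    replaced with half of the detection limit. Values are illustrative only.
    """
    from pandas import DataFrame, to_numeric

    df = DataFrame({'val': ['<0.01', '0.5', '<2', '3.1']})
    less1 = df['val'].str.match('<')
    df.loc[less1, 'val'] = to_numeric(df.loc[less1, 'val'].str.replace('<', ''), errors='coerce') * 0.5
    return to_numeric(df['val'], errors='coerce')  # 0.005, 0.5, 1.0, 3.1
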
def rd_ht_wq_data(hts, sites=None, mtypes=None, start=None, end=None, dtl_method=None, output_site_data=False,
                  mtype_params=None, sample_params=None):
    """
    Function to read data from an hts file and optionally select specific sites and aggregate the data.

    Parameters
    ----------
    hts : str
        Path to the hts file.
    sites : list
        A list of site names within the hts file.
    mtypes : list
        A list of measurement types that should be returned.
    start : str
        The start date to retrieve from the data in ISO format (e.g. '2011-11-30 00:00').
    end : str
        The end date to retrieve from the data in ISO format (e.g. '2011-11-30 00:00').
    dtl_method : None, 'standard', 'trend'
        The method to use to convert values under a detection limit to numeric. None does no conversion. 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis and includes an additional column dtl_ratio referring to the ratio of values under the detection limit.
    output_site_data : bool
        Should the site data be output?
    mtype_params : list or None
        Measurement type parameter names to extract alongside each value.
    sample_params : list or None
        'WQ Sample' parameter names to extract alongside each value.

    Returns
    -------
    DataFrame
    """
    # agg_unit_dict = {'l/s': 1, 'm3/s': 1, 'm3/hour': 1, 'mm': 1, 'm3': 4}
    # unit_convert = {'l/s': 0.001, 'm3/s': 1, 'm3/hour': 1, 'mm': 1, 'm3': 4}

    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05', 'Squalarc', 'SITES', col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry', axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05', 'Wells', 'WELL_DETAILS', col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry', axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = sites1

    ### First read all of the sites in the hts file and select the ones to be read
    sites_df = rd_hilltop_sites(hts, sites=sites2, mtypes=mtypes, rem_wq_sample=False)

    ### Open the hts file
    wqr = Dispatch("Hilltop.WQRetrieval")
    dfile = Dispatch("Hilltop.DataFile")
    try:
        dfile.Open(hts)
    except ValueError:
        print(dfile.errmsg)

    ### Iterate through the hts file
    df_lst = []
    for i in sites_df.index:
        site = sites_df.loc[i, 'site']
        mtype = sites_df.loc[i, 'mtype']
        if mtype == 'WQ Sample':
            continue
        wqr = dfile.FromWQSite(site, mtype)

        ## Set up start and end times and aggregation initiation
        if start is None:
            start1 = wqr.DataStartTime
        else:
            start1 = start
        if end is None:
            end1 = wqr.DataEndTime
        else:
            end1 = end
        wqr.FromTimeRange(start1, end1)

        ## Extract data
        data = []
        time = []
        sample_p = []  # initialized up front so the 'if sample_p' check below is always defined
        mtype_p = []
        test_params = sites_df[sites_df.site == site].mtype.unique()
        if ('WQ Sample' in test_params) & (isinstance(mtype_params, list) | isinstance(sample_params, list)):
            while wqr.GetNext:
                data.append(wqr.value)
                time.append(str(pytime_to_datetime(wqr.time)))
                sample_p.append({sp: wqr.params(sp).encode('ascii', 'ignore') for sp in sample_params})
                mtype_p.append({mp: wqr.params(mp).encode('ascii', 'ignore') for mp in mtype_params})
        else:
            while wqr.GetNext:
                data.append(wqr.value)
                time.append(str(pytime_to_datetime(wqr.time)))

        if data:
            df_temp = DataFrame({'time': time, 'data': data, 'site': site, 'mtype': mtype})
            if sample_p:
                df_temp = concat([df_temp, DataFrame(sample_p), DataFrame(mtype_p)], axis=1)
            df_lst.append(df_temp)

    dfile.Close()
    wqr.close()
    if df_lst:
        data = concat(df_lst)
        data.loc[:, 'time'] = to_datetime(data.loc[:, 'time'])
        data1 = to_numeric(data.loc[:, 'data'], errors='coerce')
        data.loc[data1.notnull(), 'data'] = data1[data1.notnull()]
        # data.loc[:, 'data'].str.replace('*', '')
        data = data.reset_index(drop=True)

        #### Convert detection limit values
        if dtl_method is not None:
            less1 = data['data'].str.match('<')
            if less1.sum() > 0:
                less1.loc[less1.isnull()] = False
                data2 = data.copy()
                data2.loc[less1, 'data'] = to_numeric(data.loc[less1, 'data'].str.replace('<', ''),
                                                      errors='coerce') * 0.5
                if dtl_method == 'standard':
                    data3 = data2
                if dtl_method == 'trend':
                    df1 = data2.loc[less1]
                    count1 = data.groupby('mtype')['data'].count()
                    count1.name = 'tot_count'
                    count_dtl = df1.groupby('mtype')['data'].count()
                    count_dtl.name = 'dtl_count'
                    count_dtl_val = df1.groupby('mtype')['data'].nunique()
                    count_dtl_val.name = 'dtl_val_count'
                    combo1 = concat([count1, count_dtl, count_dtl_val], axis=1, join='inner')
                    combo1['dtl_ratio'] = (combo1['dtl_count'] / combo1['tot_count']).round(2)

                    ## conditionals
                    param2 = combo1[(combo1['dtl_ratio'] > 0.4) & (combo1['dtl_val_count'] != 1)]
                    over_40 = data['mtype'].isin(param2.index)

                    ## Calc detection limit values
                    data3 = merge(data, combo1['dtl_ratio'].reset_index(), on='mtype', how='left')
                    data3.loc[:, 'data_dtl'] = data2['data']

                    max_dtl_val = data2[over_40 & less1].groupby('mtype')['data'].transform('max')
                    max_dtl_val.name = 'dtl_data_max'
                    data3.loc[over_40 & less1, 'data_dtl'] = max_dtl_val
            else:
                data3 = data
        else:
            data3 = data

        if output_site_data:
            sites_df = sites_df[~(sites_df.mtype == 'WQ Sample')]
            return data3, sites_df
        else:
            return data3
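
def _rd_ht_wq_data_example():
    """
    Hedged usage sketch (not part of the original module; Windows-only, requires
    the Hilltop COM libraries). The hts path, site name, and measurement type are
    illustrative only.
    """
    wq, site_info = rd_ht_wq_data(r'C:\temp\example.hts', sites=['SQ30001'],
                                  mtypes=['Total Nitrogen'], output_site_data=True)
    return wq, site_info
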
def rd_hilltop_sites(hts, sites=None, mtypes=None, rem_wq_sample=True):
    """
    Function to read the site names, measurement types, and units of a Hilltop hts file. Returns a DataFrame.

    Arguments:\n
    hts -- Path to the hts file (str).\n
    sites -- A list of site names within the hts file.\n
    mtypes -- A list of measurement types that should be returned.\n
    rem_wq_sample -- Should the 'WQ Sample' measurement type be removed from the output?
    """
    if sites is not None:
        sites = select_sites(sites)
    if mtypes is not None:
        mtypes = select_sites(mtypes)

    cat = Dispatch("Hilltop.Catalogue")
    if not cat.Open(hts):
        raise ValueError(cat.errmsg)

    dfile = Dispatch("Hilltop.DataRetrieval")
    try:
        dfile.Open(hts)
    except ValueError:
        print(dfile.errmsg)

    sites_lst = []

    ### Iterate through all sites/datasources/mtypes
    cat.StartSiteEnum
    while cat.GetNextSite:
        site_name = cat.SiteName
        if sites is None:
            pass
        elif site_name in sites:
            pass
        else:
            continue
        while cat.GetNextDataSource:
            ds_name = cat.DataSource
            try:
                start1 = pytime_to_datetime(cat.DataStartTime)
                end1 = pytime_to_datetime(cat.DataEndTime)
            except ValueError:
                bool_site = dfile.FromSite(site_name, ds_name, 1)
                if bool_site:
                    start1 = pytime_to_datetime(cat.DataStartTime)
                    end1 = pytime_to_datetime(cat.DataEndTime)
                else:
                    print('No site data for ' + site_name + '...for some reason...')
            while cat.GetNextMeasurement:
                mtype1 = cat.Measurement
                if mtype1 == 'Item2':
                    continue
                elif mtypes is None:
                    pass
                elif mtype1 in mtypes:
                    pass
                else:
                    continue
                divisor = cat.Divisor
                unit1 = cat.Units
                if unit1 == '%':
                    # print('Site ' + name1 + ' has no units')
                    unit1 = ''
                sites_lst.append([site_name, ds_name, mtype1, unit1, divisor, str(start1), str(end1)])

    sites_df = DataFrame(sites_lst, columns=['site', 'data_source', 'mtype', 'unit', 'divisor', 'start_date',
                                             'end_date'])
    if rem_wq_sample:
        sites_df = sites_df[~(sites_df.mtype == 'WQ Sample')]
    dfile.Close()
    cat.Close()
    return sites_df
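
def _rd_hilltop_sites_example():
    """
    Hedged usage sketch (not part of the original module; Windows-only, requires
    the Hilltop COM libraries): list the sites and measurement types in an hts
    file. The path and measurement type are illustrative only.
    """
    return rd_hilltop_sites(r'C:\temp\example.hts', mtypes=['Total Nitrogen'])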