def xy_to_gpd(id_col, x_col, y_col, df=None, crs=2193):
    """
    Convert a DataFrame with x and y coordinates to a GeoDataFrame of points.

    Parameters
    ----------
    id_col : str or list of str
        The column(s) from the DataFrame to be returned. Either a single column name or a list of column names.
    x_col : str or ndarray
        Either the column name that has the x values within df or an array of x values.
    y_col : str or ndarray
        Same as x_col except for y.
    df : DataFrame
        The DataFrame with the location data.
    crs : int, str, or dict
        The projection of the data.

    Returns
    -------
    GeoDataFrame
        Of points.
    """
    if isinstance(x_col, str):
        geometry = [Point(xy) for xy in zip(df[x_col], df[y_col])]
    else:
        x1 = select_sites(x_col)
        y1 = select_sites(y_col)
        geometry = [Point(xy) for xy in zip(x1, y1)]

    if isinstance(id_col, str) and (df is not None):
        id_data = df[id_col]
    elif isinstance(id_col, list):
        if df is not None:
            id_data = df[id_col]
        else:
            id_data = id_col
    elif isinstance(id_col, (np.ndarray, pd.Series, pd.Index)):
        id_data = id_col
    else:
        raise ValueError('id_data could not be determined')

    if isinstance(crs, int):
        crs1 = convert_crs(crs)
    elif isinstance(crs, (str, dict)):
        crs1 = crs
    else:
        raise ValueError('crs must be an int, str, or dict')

    gpd1 = gpd.GeoDataFrame(id_data, geometry=geometry, crs=crs1)
    return gpd1
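# Example usage (illustrative sketch; the site names and NZTM coordinates below are
# placeholders, not real data):
#
# sites_df = pd.DataFrame({'site': ['A1', 'B2'],
#                          'NZTMX': [1570000, 1492000],
#                          'NZTMY': [5180000, 5175000]})
# sites_gpd = xy_to_gpd('site', 'NZTMX', 'NZTMY', sites_df, crs=2193)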
def hydstra_site_mod_time(sites=None):
    """
    Extract modification times from the Hydstra data archive files. Returns a DataFrame of sites by
    modification date. The modification date is in GMT.

    Parameters
    ----------
    sites : list, array, Series, or None
        If sites is not None, then return only the given sites.

    Returns
    -------
    DataFrame
    """
    site_files_path = r'\\fileservices02\ManagedShares\Data\Hydstra\prod\hyd\dat\hyd'
    files1 = rd_dir(site_files_path, 'A')
    file_sites = [os.path.splitext(i)[0] for i in files1]

    if sites is not None:
        sites1 = select_sites(sites).astype(str)
        sites2 = [i.replace('/', '_') for i in sites1]
        file_sites1 = [i for i in file_sites if i in sites2]
    else:
        file_sites1 = file_sites

    mod_times = pd.to_datetime([round(os.path.getmtime(os.path.join(site_files_path, i + '.A'))) for i in file_sites1], unit='s')

    df = pd.DataFrame({'site': file_sites1, 'mod_time': mod_times})
    return df
def rd_blocklist(sites, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'],
                 start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01'):
    """
    Wrapper function to extract info about when data has changed between modification dates.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    datasources : list of str
        Hydstra datasource codes (usually ['A']).
    variables : list of str
        The Hydstra conversion data variables ('140' is flow).
    start : str
        The start time in the format of '2001-01-01'.
    end : str
        Same formatting as start.
    start_modified : str
        The starting date of the modification period.
    end_modified : str
        The ending date of the modification period.

    Returns
    -------
    DataFrame
        With site, data_source, varto, from_mod_date, and to_mod_date.
    """
    ### Process sites
    sites1 = select_sites(sites).tolist()

    ### Open connection
    hyd = openHyDb()
    with hyd as h:
        df = h.get_ts_blockinfo(sites1, start=start, end=end, datasources=datasources, variables=variables,
                                start_modified=start_modified, end_modified=end_modified)
    return df
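# Example usage (illustrative sketch; the site numbers and modification window are placeholders):
#
# changed = rd_blocklist([70105, 69607], datasources=['A'], variables=['140'],
#                        start_modified='2017-01-01', end_modified='2017-02-01')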
def get_ts_traces(self, site_list, start=0, end=0, varfrom=100, varto=140, interval='day', multiplier=1,
                  datasource='A', data_type='mean', qual_codes=[30, 20, 10, 11, 21, 18], report_time=None):
    """
    Request time series traces from Hydstra via hydllp and return a DataFrame indexed by site and time,
    filtered to the given quality codes.
    """
    # Convert the site list to a comma delimited string of sites
    sites = select_sites(site_list).astype(str)
    site_list_str = ','.join([str(site) for site in sites])

    ### Datetime conversion - with dates < 1900
    c1900 = pd.Timestamp('1900-01-01')
    if start != 0:
        start1 = pd.Timestamp(start)
        if start1 > c1900:
            start = start1.strftime('%Y%m%d%H%M%S')
        else:
            start = start1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')
    if end != 0:
        end1 = pd.Timestamp(end)
        if end1 > c1900:
            end = end1.strftime('%Y%m%d%H%M%S')
        else:
            end = end1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')

    ts_traces_request = {'function': 'get_ts_traces',
                         'version': 2,
                         'params': {'site_list': site_list_str,
                                    'start_time': start,
                                    'end_time': end,
                                    'varfrom': varfrom,
                                    'varto': varto,
                                    'interval': interval,
                                    'datasource': datasource,
                                    'data_type': data_type,
                                    'multiplier': multiplier,
                                    'report_time': report_time}}

    ts_traces_request = self.query_by_dict(ts_traces_request)
    j1 = ts_traces_request['return']['traces']

    ### Convert json to a dataframe
    sites = [str(f['site']) for f in j1]
    out1 = pd.DataFrame()
    for i in range(len(j1)):
        df1 = pd.DataFrame(j1[i]['trace'])
        if not df1.empty:
            df1.rename(columns={'v': 'data', 't': 'time', 'q': 'qual_code'}, inplace=True)
            df1['data'] = pd.to_numeric(df1['data'], errors='coerce')
            df1['time'] = pd.to_datetime(df1['time'], format='%Y%m%d%H%M%S')
            df1['qual_code'] = pd.to_numeric(df1['qual_code'], errors='coerce', downcast='integer')
            df1['site'] = sites[i]
            df2 = df1[df1.qual_code.isin(qual_codes)]
            out1 = pd.concat([out1, df2])

    out2 = out1.set_index(['site', 'time'])[['data', 'qual_code']]
    return out2
def rd_hydstra_db(sites, start=0, end=0, datasource='A', data_type='mean', varfrom=100, varto=140,
                  interval='day', multiplier=1, qual_codes=[30, 20, 10, 11, 21, 18], report_time=None,
                  sites_chunk=20, print_sites=False, export_path=None):
    """
    Wrapper function over hydllp to read in data from Hydstra's database. Must be run in a 32bit python.

    If either start or end is not 0, then they both need a date.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    start : str or int of 0
        The start time in the format of either '2001-01-01' or 0 (for all data).
    end : str or int of 0
        Same formatting as start.
    datasource : str
        Hydstra datasource code (usually 'A').
    data_type : str
        mean, maxmin, max, min, start, end, first, last, tot, point, partialtot, or cum.
    varfrom : int or float
        The hydstra source data variable (100.00 is water level).
    varto : int or float
        The hydstra conversion data variable (140.00 is flow).
    interval : str
        The frequency of the output data (year, month, day, hour, minute, second, period).
        If data_type is 'point', then interval cannot be 'period' (use anything else, it doesn't matter).
    multiplier : int
        The multiplier of the interval frequency.
    qual_codes : list of int
        The quality codes in Hydstra for filtering the data.
    sites_chunk : int
        Number of sites to request from hydllp at one time. Do not change unless you understand what it does.
    print_sites : bool
        Should the sites be printed as they are extracted?
    export_path : str or None
        Path to save the output, or None.

    Returns
    -------
    DataFrame
        In long format with site and time as a MultiIndex.
    """
    ### Process sites into workable chunks
    sites1 = select_sites(sites)
    n_chunks = np.ceil(len(sites1) / float(sites_chunk))
    sites2 = np.array_split(sites1, n_chunks)

    ### Run instance of hydllp
    data = pd.DataFrame()
    for i in sites2:
        if print_sites:
            print(i)
        ### Open connection
        hyd = openHyDb()
        with hyd as h:
            df = h.get_ts_traces(i, start=start, end=end, datasource=datasource, data_type=data_type,
                                 varfrom=varfrom, varto=varto, interval=interval, multiplier=multiplier,
                                 qual_codes=qual_codes, report_time=report_time)
        data = pd.concat([data, df])

    if isinstance(export_path, str):
        save_df(data, export_path)

    return data
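# Example usage (illustrative sketch; the site numbers and dates are placeholders, and the call
# must be run from a 32bit python that can load hydllp):
#
# flow = rd_hydstra_db([70105, 69607], start='2010-01-01', end='2015-12-31',
#                      varfrom=100, varto=140, interval='day')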
def get_ts_blockinfo(self, site_list, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'],
                     start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01',
                     fill_gaps=0, auditinfo=0):
    """
    Request block (modification) info from Hydstra via hydllp and return a DataFrame with site,
    data_source, varto, from_mod_date, and to_mod_date.
    """
    # Convert the site list to a comma delimited string of sites
    sites = select_sites(site_list).astype(str)
    site_list_str = ','.join([str(site) for site in sites])

    ### Datetime conversion
    start = pd.Timestamp(start).strftime('%Y%m%d%H%M%S')
    end = pd.Timestamp(end).strftime('%Y%m%d%H%M%S')
    start_modified = pd.Timestamp(start_modified).strftime('%Y%m%d%H%M%S')
    end_modified = pd.Timestamp(end_modified).strftime('%Y%m%d%H%M%S')

    ### dict request
    ts_blockinfo_request = {"function": "get_ts_blockinfo",
                            "version": 2,
                            "params": {'site_list': site_list_str,
                                       'datasources': datasources,
                                       'variables': variables,
                                       'starttime': start,
                                       'endtime': end,
                                       'start_modified': start_modified,
                                       'end_modified': end_modified}}

    ts_blockinfo_result = self.query_by_dict(ts_blockinfo_request)
    blocks = ts_blockinfo_result['return']['blocks']
    df1 = pd.DataFrame(blocks)
    if df1.empty:
        return df1
    else:
        df1['endtime'] = pd.to_datetime(df1['endtime'], format='%Y%m%d%H%M%S')
        df1['starttime'] = pd.to_datetime(df1['starttime'], format='%Y%m%d%H%M%S')
        df1['variable'] = pd.to_numeric(df1['variable'], errors='coerce', downcast='integer')

        df2 = df1[['site', 'datasource', 'variable', 'starttime', 'endtime']].sort_values(['site', 'variable', 'starttime'])
        df2.rename(columns={'datasource': 'data_source', 'variable': 'varto', 'starttime': 'from_mod_date',
                            'endtime': 'to_mod_date'}, inplace=True)

        return df2
def crc_band_flow(site_lst=None, crc_lst=None, names=False):
    """
    Determine the min flow conditions for each flow site, band, and crc.

    Parameters
    ----------
    site_lst : list, array, or None
        Gauging site numbers to filter to, or None for all sites.
    crc_lst : list, array, or None
        Consent numbers (crc) to filter to, or None for all consents.
    names : bool
        Should the Waterway and Location names be included?

    Returns
    -------
    DataFrame
    """
    ### Database parameters
    # crc, sites, and bands
    server = 'SQL2012PROD03'
    database = 'LowFlows'
    crc_table = 'vLowFlowConsents2'
    # id and gauge site
    gauge_table = 'LowFlowSite'
    # Internal site id, band, and min flow
    min_flow_table = 'LowFlowSiteBandPeriodAllocation'

    ## fields and associated column names
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    crc_names = ['id', 'band', 'crc']
    if names:
        gauge_fields = ['SiteID', 'RefDBaseKey', 'Waterway', 'Location']
        gauge_names = ['id', 'site', 'Waterway', 'Location']
    else:
        gauge_fields = ['SiteID', 'RefDBaseKey']
        gauge_names = ['id', 'site']
    min_flow_fields = ['SiteID', 'BandNo', 'PeriodNo', 'Allocation', 'Flow']
    min_flow_names = ['id', 'band', 'mon', 'allo', 'min_flow']

    ### Load in data
    crc = rd_sql(server, database, crc_table, crc_fields)
    crc.columns = crc_names
    # Rename first, then strip; the consent numbers live in the 'crc' column after renaming
    crc['crc'] = crc['crc'].str.strip()

    gauge = rd_sql(server, database, gauge_table, gauge_fields)
    gauge.columns = gauge_names

    min_flow = rd_sql(server, database, min_flow_table, min_flow_fields)
    min_flow.columns = min_flow_names

    ### Remove min flows that are not restricted
    min_flow1 = min_flow[min_flow.allo < 100]

    ### Lots of table merges!
    crc_min_flow = pd.merge(crc, min_flow1, on=['id', 'band'])
    crc_min_gauge = pd.merge(gauge, crc_min_flow, on='id').drop('id', axis=1)

    ### Query results
    if crc_lst is not None:
        crc_sel = select_sites(crc_lst)
        sel1 = crc_min_gauge[np.in1d(crc_min_gauge.crc, crc_sel)]
    else:
        sel1 = crc_min_gauge
    if site_lst is not None:
        site_sel = select_sites(site_lst).astype(str)
        sel2 = sel1[np.in1d(sel1.site, site_sel)]
    else:
        sel2 = sel1

    return sel2
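# Example usage (illustrative sketch; the site number is a placeholder):
#
# min_flows_all = crc_band_flow(names=True)
# min_flows_site = crc_band_flow(site_lst=[68801])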
def flow_ros(select=all, start_date='1900-01-01', end_date='2016-06-30', fill_na=False,
             flow_csv='S:/Surface Water/shared/base_data/flow/flow_data.csv',
             min_flow_cond_csv='S:/Surface Water/shared/base_data/usage/restrictions/min_flow_cond.csv',
             min_flow_id_csv='S:/Surface Water/shared/base_data/usage/restrictions/min_flow_id.csv',
             min_flow_mon_csv='S:/Surface Water/shared/base_data/usage/restrictions/mon_min_flow.csv',
             min_flow_restr_csv='S:/Surface Water/shared/base_data/usage/restrictions/min_flow_restr.csv'):
    """
    Estimate the percent allowable abstraction per band_id.

    Parameters
    ----------
    select : list, array, dataframe, or single column csv file
        Site numbers to restrict the results to, or the builtin ``all`` for every site.
    start_date, end_date : str
        The start and/or end date for the results.
    fill_na : bool
        Should missing values be forward filled?
    flow_csv : str or DataFrame
        Flow data, either as a csv path in long format or as a DataFrame of sites by time.
    *_csv : str
        csv files necessary for the analysis.

    Returns
    -------
    DataFrame
    """

    def norm_eval(series):
        if series['lower'] == '0':
            lower1 = '-1'
        else:
            lower1 = series['lower']
        stmt = ('(' + series['object'] + '[' + str(int(series['site'])) + ']' + ' <= ' + series['upper'] + ')' +
                ' & ' +
                '(' + series['object'] + '[' + str(int(series['site'])) + ']' + ' > ' + lower1 + ')')
        return stmt

    def stmt_set(norm_conds, other_conds):
        if (len(norm_conds) > 0) & (len(other_conds) > 0):
            max1 = norm_conds.loc[norm_conds.index[-1], 'upper']
            new1 = norm_conds.iloc[0, :]
            new1.loc['upper'] = '100000'
            new1.loc['lower'] = max1
            new1.loc['cond_id'] = 0
            norm_conds.loc['a', :] = new1
            stmt = [norm_eval(norm_conds.loc[x, :]) for x in norm_conds.index]
            # extend (not append) so each 'other' condition stays a separate statement/id
            stmt.extend(other_conds.other.tolist())
            ids = norm_conds.cond_id.tolist()
            ids.extend(other_conds.cond_id.tolist())
        elif len(norm_conds) > 0:
            max1 = norm_conds.loc[norm_conds.index[-1], 'upper']
            new1 = norm_conds.iloc[0, :]
            new1.loc['upper'] = '100000'
            new1.loc['lower'] = max1
            new1.loc['cond_id'] = 0
            norm_conds.loc['a', :] = new1
            stmt = [norm_eval(norm_conds.loc[x, :]) for x in norm_conds.index]
            ids = norm_conds.cond_id.tolist()
        elif len(other_conds) > 0:
            stmt = other_conds.other.tolist()
            ids = other_conds.cond_id.tolist()
        return [stmt, ids]

    def pro_rata(flow, lower, upper):
        perc = (flow - lower) * 100 / (upper - lower)
        perc[perc < 0] = 0
        return perc

    ### Read in data tables
    min_flow_cond = pd.read_csv(min_flow_cond_csv).dropna(how='all')
    min_flow_id = pd.read_csv(min_flow_id_csv).dropna(how='all')
    min_flow_mon = pd.read_csv(min_flow_mon_csv).dropna(how='all')
    min_flow_restr = pd.read_csv(min_flow_restr_csv).dropna(how='all')

    if isinstance(flow_csv, str):
        flow1 = pd.read_csv(flow_csv)
        flow1.loc[:, 'time'] = pd.to_datetime(flow1.loc[:, 'time'])
        flow = flow1.pivot_table('data', 'time', 'site')
    else:
        flow = flow_csv
    flow.columns = flow.columns.astype('int32')

    ### Select specific site bands
    if select is not all:
        bands1 = select_sites(select).astype(str)
        min_flow_id = min_flow_id[np.in1d(min_flow_id.site.astype(str), bands1)]

    ### Add in additional data from hydrotel if needed
    if sum(min_flow_id.site == 69607) > 0:
        hydrotel_flow_sites = [696501]
        hydrotel_wl_sites = [69660]

        opuha_flow = rd_hydrotel(hydrotel_flow_sites, mtype='flow_tel', resample='day', fun='avg', pivot=True).value
        opuha_flow.columns = opuha_flow.columns.astype(int)
        wl = rd_hydrotel(hydrotel_wl_sites, mtype='swl_tel', resample='day', fun='avg', pivot=True).value
        wl.columns = wl.columns.astype(int)

        UF = (1.288 * flow[69615] + 0.673 * flow[69616] + 2.438 * flow[69618] - 2.415)
        UF.name = 1696297

        flow = pd.concat([flow, opuha_flow, UF], axis=1)
        flow.columns = flow.columns.astype(int)

    ### Create monthly time series of flow restrictions
    mon_series1 = pd.DataFrame(flow.index.month, index=flow.index, columns=['mon'])
    mon_series = pd.merge(mon_series1, min_flow_mon, on='mon', how='left')
    mon_series.index = mon_series1.index

    ### Run through each band
    ## Create blank dataframe
    c1 = min_flow_id.site.tolist()
    c2 = min_flow_id.allo_band_id.tolist()
    index1 = pd.MultiIndex.from_tuples(list(zip(*[c1, c2])))

    if sum(min_flow_id.site == 69607) > 0:
        eval_dict = {'flow': flow, 'wl': wl, 'mon_series': mon_series}
    else:
        eval_dict = {'flow': flow, 'mon_series': mon_series}

    allow1 = pd.DataFrame(np.nan, index=flow.index, columns=index1)
    for j in min_flow_id.index:
        site_id = min_flow_id.site[j]
        band_id = min_flow_id.allo_band_id[j]
        t1 = min_flow_id.loc[j, :]

        cond_id = literal_eval(t1['cond_id'])
        cond_id.extend([0])
        restr_id = literal_eval(t1['restr_id'])
        restr_id.extend(['r100'])
        cond_restr = dict(zip(cond_id, restr_id))

        conds1 = min_flow_cond[np.in1d(min_flow_cond.cond_id, cond_id)]
        norm_conds = conds1[conds1.object != 'other']
        other_conds = conds1[conds1.object == 'other']

        stmt, ids = stmt_set(norm_conds, other_conds)

        df1 = pd.concat((eval(x, globals(), eval_dict) for x in stmt), axis=1)
        df1.columns = ids
        df2 = df1.copy()
        df2.loc[:, :] = np.nan

        perc_restr = {}
        for x in cond_restr:
            if cond_restr[x] != 'pro_rata':
                perc_restr.update({x: eval(min_flow_restr.loc[min_flow_restr.restr_id == cond_restr[x], 'restr_cond'].values[0], globals(), eval_dict)})
            else:
                seta = norm_conds.loc[norm_conds.cond_id == x, :]
                pr1 = pro_rata(flow[int(seta.site)], float(seta.lower), float(seta.upper))
                perc_restr.update({x: pr1})

        for i in perc_restr:
            index = df1[i].dropna().index[np.where(df1[i].dropna())[0]]
            if isinstance(perc_restr[i], pd.Series):
                df2.loc[index, i] = perc_restr[i][index]
            else:
                df2.loc[index, i] = perc_restr[i]

        ## Take the most restrictive between the conditions
        df3 = df2.min(axis=1)

        ### Process exemptions
        if t1['exempt_id'] is not np.nan:
            exempt_id = literal_eval(t1['exempt_id'])
            exempt_restr_id = literal_eval(t1['exempt_restr_id'])
            exempt_cond_restr = dict(zip(exempt_id, exempt_restr_id))

            conds1 = min_flow_cond[np.in1d(min_flow_cond.cond_id, exempt_id)]
            norm_conds = conds1[conds1.object != 'other']
            other_conds = conds1[conds1.object == 'other']

            stmt, ids = stmt_set(norm_conds, other_conds)

            df1 = pd.concat((eval(x, globals(), eval_dict) for x in stmt), axis=1)
            df1.columns = ids
            df2 = df1.copy()
            df2.loc[:, :] = np.nan

            perc_restr = {x: eval(min_flow_restr.loc[min_flow_restr.restr_id == exempt_cond_restr[x], 'restr_cond'].values[0], globals(), eval_dict) for x in exempt_cond_restr}

            for i in perc_restr:
                index = df1[i].dropna().index[np.where(df1[i].dropna())[0]]
                if isinstance(perc_restr[i], pd.Series):
                    df2.loc[index, i] = perc_restr[i][index]
                else:
                    df2.loc[index, i] = perc_restr[i]

            ## Take the most restrictive for the exemptions
            df3_exempt = df2.min(axis=1)

            ### Take the least restrictive between the primary conditions and the exemptions
            allow1.loc[:, (site_id, band_id)] = pd.concat([df3, df3_exempt], axis=1).max(axis=1)
        else:
            allow1.loc[:, (site_id, band_id)] = df3

    ### Constrain results to dates
    allow2 = allow1[start_date:end_date].round(1)
    if fill_na:
        allow2 = allow2.ffill()
    return allow2
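# Example usage (illustrative sketch; the site number is a placeholder and the default
# csv paths must exist on the network drive):
#
# allow_perc = flow_ros(select=[69607], start_date='2010-07-01', end_date='2016-06-30', fill_na=True)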
def restr_days(select, period='A-JUN', months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
               min_sites_shp='S:/Surface Water/shared/GIS_base/vector/low_flows/min_flows_sites_Cant.shp',
               sites_col='ReferenceN', export=True, export_path='restr_days.csv'):
    """
    Determine the number of days on restriction per period according to the LowFlows database.

    Parameters
    ----------
    select : list or str
        Either a list of gauging site numbers, a shapefile polygon of an area that contains min flow sites,
        or a csv file with a column of site numbers.
    period : str
        Pandas time series code for the time period.
    months : list of int
        The specific months to include in the query.
    min_sites_shp : str
        Shapefile of the min flow sites, used when select is a polygon shapefile.
    sites_col : str
        The column name of the site numbers in min_sites_shp (or the csv file).
    export : bool
        Should the results be exported to csv?
    export_path : str
        The csv path for the export.

    Returns
    -------
    DataFrame
    """
    ########################################
    ### Parameters

    ## Query fields - Be sure to use single quotes for the names!!!
    restr_fields = ['SiteID', 'RestrictionDate', 'BandNo', 'BandAllocation']
    # sites_fields = ['SiteID', 'RefDBaseKey','RecordNo', 'WellNo']
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    sites_fields = ['Siteid', 'RefDBaseKey']

    ## Equivalent short names for analyses - Use these names!!!
    restr_names = ['SiteID', 'dates', 'band_num', 'band_restr']
    # sites_names = ['SiteID', 'gauge_num', 'crc', 'wap']
    crc_names = ['SiteID', 'band_num', 'crc']
    sites_names = ['SiteID', 'gauge_num']

    ## Databases
    # daily restrictions
    server1 = 'SQL2012PROD03'
    database1 = 'LowFlows'
    restr_table = 'LowFlows.dbo.LowFlowSiteRestrictionDaily'
    restr_where = {'SnapshotType': ['Live']}

    # Sites info
    server2 = 'SQL2012PROD03'
    database2 = 'LowFlows'
    sites_table = 'LowFlows.dbo.vLowFlowSite'

    # crc, sites, and bands
    server3 = 'SQL2012PROD03'
    database3 = 'LowFlows'
    crc_table = 'LowFlows.dbo.vLowFlowConsents2'

    ########################################
    ## Make the sites selection
    if isinstance(select, str):
        if select.endswith('.shp'):
            sites3 = sel_sites_poly(select, min_sites_shp)[sites_col].unique()
        else:
            sites3 = pd.read_csv(select)[sites_col].unique()
    elif isinstance(select, (list, np.ndarray)):
        sites3 = select_sites(select)

    ########################################
    ### Read in data
    sites = rd_sql(server2, database2, sites_table, sites_fields)
    sites.columns = sites_names

    sites4 = sites.loc[sites.gauge_num.isin(sites3.astype(str)), 'SiteID'].unique().astype('int32').tolist()
    restr_where.update({'SiteID': sites4})

    restr = rd_sql(server1, database1, restr_table, restr_fields, restr_where).drop_duplicates(keep='last')
    restr.columns = restr_names

    crc = rd_sql(server3, database3, crc_table, crc_fields)
    crc.columns = crc_names

    ##################################
    ### Calculate the number of days on full and partial restriction

    ## Remove anything above 100%
    restr1 = restr[restr.band_restr <= 100].copy()

    ## Recategorize band restr (101 = partial, 102 = full, 103 = no restriction)
    partial_index = (restr1.band_restr > 0) & (restr1.band_restr < 100)
    restr1.loc[partial_index, 'band_restr'] = 101
    restr1.loc[restr1.band_restr == 100, 'band_restr'] = 103
    restr1.loc[restr1.band_restr == 0, 'band_restr'] = 102

    ## Restrict by months
    mon_index = restr1.dates.dt.month.isin(months)
    restr1 = restr1[mon_index]

    ## Do the work
    def sp_count(df, num):
        df.index = df.dates
        df_grp = df[df.band_restr == num].resample(period)
        df_count = df_grp['band_restr'].count()
        return df_count

    restr1_grp = restr1.groupby(['SiteID', 'band_num'])
    partial1 = restr1_grp.apply(sp_count, 101)
    partial1.name = 'partial'
    full1 = restr1_grp.apply(sp_count, 102)
    full1.name = 'full'
    # no1 = restr1_grp.apply(sp_count, 103)

    tot1 = pd.concat([partial1, full1], axis=1)
    tot1.index.names = ['SiteID', 'band_num', 'dates']
    if partial1.empty:
        tot1['partial'] = 0
    if full1.empty:
        tot1['full'] = 0
    # tot1.columns = ['partial', 'full']
    tot2 = tot1.reset_index()

    ## Relabel the sites to actually be site numbers
    sites2 = sites.drop_duplicates()
    tot3 = pd.merge(tot2, sites2, on='SiteID', how='left')
    tot3.loc[tot3.partial.isnull(), 'partial'] = 0
    tot3.loc[tot3.full.isnull(), 'full'] = 0
    tot3 = tot3[tot3.gauge_num.notnull()]

    ## Summarize the results
    restr2 = tot3[['gauge_num', 'band_num', 'dates', 'partial', 'full']]

    if export:
        restr2.to_csv(export_path, index=False)
    return restr2
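# Example usage (illustrative sketch; the site numbers are placeholders):
#
# restr_summary = restr_days([68801, 69505], period='A-JUN', months=[10, 11, 12, 1, 2, 3, 4],
#                            export=False)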
def rd_squalarc(sites, mtypes=None, from_date=None, to_date=None, convert_dtl=False, dtl_method=None, export=None):
    """
    Read in "squalarc" data, which is actually stored in the mssql db.

    Parameters
    ----------
    sites : ndarray, list, or str
        The site names as a list, array, csv with the first column as the site names, or a polygon shapefile
        of the area of interest.
    mtypes : list or None
        A list of measurement type names to be in the output. Leaving it empty returns all mtypes.
    from_date : str
        A start date string in the format of '2010-01-01'.
    to_date : str
        An end date string in the format of '2011-01-01'.
    convert_dtl : bool
        Should values under the detection limit be converted to numeric?
    dtl_method : str
        The method to use to convert values under a detection limit to numeric. None or 'standard' takes half
        of the detection limit. 'trend' is meant as an output for trend analysis and includes an additional
        column dtl_ratio referring to the ratio of values under the detection limit.
    export : str or None
        Either None or a string path to a csv file.
    """
    #### Read in sites
    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, gpd.GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05', 'Squalarc', 'SITES', col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry', axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05', 'Wells', 'WELL_DETAILS', col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry', axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = pd.Series(sites1, name='site').astype(str).tolist()

    #### Extract the rest of the data
    col_names = ['Site_ID', 'SAMPLE_NO', 'ME_TYP', 'Collect_Date', 'Collect_Time', 'PA_NAME', 'PARAM_UNITS', 'SRESULT']
    new_names = ['site', 'sample_id', 'source', 'date', 'time', 'parameter', 'units', 'val']

    if len(sites2) > 10000:
        n_chunks = int(np.ceil(len(sites2) * 0.0001))
        sites3 = [sites2[i::n_chunks] for i in range(n_chunks)]
        samples_tab = pd.DataFrame()
        for i in sites3:
            samples_tab1 = rd_sql('SQL2012PROD05', 'Squalarc', '"SQL_SAMPLE_METHODS+"', col_names=col_names,
                                  where_col='Site_ID', where_val=i)
            samples_tab1.columns = new_names
            samples_tab1.loc[:, 'source'] = samples_tab1.loc[:, 'source'].str.lower()
            samples_tab = pd.concat([samples_tab, samples_tab1])
    else:
        samples_tab = rd_sql('SQL2012PROD05', 'Squalarc', '"SQL_SAMPLE_METHODS+"', col_names=col_names,
                             where_col='Site_ID', where_val=sites2)
        samples_tab.columns = new_names
        samples_tab.loc[:, 'source'] = samples_tab.loc[:, 'source'].str.lower()

    samples_tab2 = samples_tab.copy()
    num_test = pd.to_numeric(samples_tab2.loc[:, 'time'], errors='coerce')
    samples_tab2.loc[num_test.isnull(), 'time'] = '0000'
    # Strip literal periods from the time strings (regex=False so '.' is not treated as a wildcard)
    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('.', '', regex=False)
    samples_tab2 = samples_tab2[samples_tab2.date.notnull()]
    # samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('9999', '0000')
    time1 = pd.to_datetime(samples_tab2.time, format='%H%M', errors='coerce')
    time1[time1.isnull()] = pd.Timestamp('2000-01-01 00:00:00')
    datetime1 = pd.to_datetime(samples_tab2.date.dt.date.astype(str) + ' ' + time1.dt.time.astype(str))
    samples_tab2.loc[:, 'date'] = datetime1
    samples_tab2 = samples_tab2.drop('time', axis=1)
    samples_tab2.loc[samples_tab2.val.isnull(), 'val'] = np.nan
    samples_tab2.loc[samples_tab2.val == 'N/A', 'val'] = np.nan

    #### Select within time range
    if isinstance(from_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] >= from_date]
    if isinstance(to_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] <= to_date]

    if mtypes is not None:
        mtypes1 = select_sites(mtypes)
        data = samples_tab2[samples_tab2.parameter.isin(mtypes1)].reset_index(drop=True)
    else:
        data = samples_tab2.reset_index(drop=True)

    #### Correct poorly typed in site names
    data.loc[:, 'site'] = data.loc[:, 'site'].str.upper().str.replace(' ', '')

    #### Convert detection limit values
    if convert_dtl:
        less1 = data['val'].str.match('<')
        if less1.sum() > 0:
            less1.loc[less1.isnull()] = False
            data2 = data.copy()
            data2.loc[less1, 'val'] = pd.to_numeric(data.loc[less1, 'val'].str.replace('<', ''), errors='coerce') * 0.5
            if dtl_method in (None, 'standard'):
                data3 = data2
            if dtl_method == 'trend':
                df1 = data2.loc[less1]
                count1 = data.groupby('parameter')['val'].count()
                count1.name = 'tot_count'
                count_dtl = df1.groupby('parameter')['val'].count()
                count_dtl.name = 'dtl_count'
                count_dtl_val = df1.groupby('parameter')['val'].nunique()
                count_dtl_val.name = 'dtl_val_count'
                combo1 = pd.concat([count1, count_dtl, count_dtl_val], axis=1, join='inner')
                combo1['dtl_ratio'] = (combo1['dtl_count'] / combo1['tot_count']).round(2)

                ## conditionals
                # param1 = combo1[(combo1['dtl_ratio'] <= 0.4) | (combo1['dtl_ratio'] == 1)]
                # under_40 = data['parameter'].isin(param1.index)
                param2 = combo1[(combo1['dtl_ratio'] > 0.4) & (combo1['dtl_val_count'] != 1)]
                over_40 = data['parameter'].isin(param2.index)

                ## Calc detection limit values
                data3 = pd.merge(data, combo1['dtl_ratio'].reset_index(), on='parameter', how='left')
                data3.loc[:, 'val_dtl'] = data2['val']

                max_dtl_val = data2[over_40 & less1].groupby('parameter')['val'].transform('max')
                max_dtl_val.name = 'dtl_val_max'
                data3.loc[over_40 & less1, 'val_dtl'] = max_dtl_val
        else:
            data3 = data
    else:
        data3 = data

    #### Return and export
    if isinstance(export, str):
        data3.to_csv(export, encoding='utf-8', index=False)

    return data3
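# Example usage (illustrative sketch; the site names and parameter are placeholders):
#
# wq_data = rd_squalarc(['SQ30147', 'SQ30148'], mtypes=['Nitrate Nitrogen'],
#                       from_date='2010-01-01', to_date='2015-12-31', convert_dtl=True)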
def rd_henry(sites, from_date=None, to_date=None, agg_day=True, sites_by_col=False, min_filter=None, export=None):
    """
    Read in gaugings data from the "Henry DB". Hopefully, they keep this around for a while longer.

    Parameters
    ----------
    sites : list or str
        Either a list of site names or a file path string that contains a column of site names.
    from_date : str
        A date string for the start of the data (e.g. '2010-01-01').
    to_date : str
        A date string for the end of the data.
    agg_day : bool
        Should the gauging dates be aggregated down to the day as opposed to having the hour and minute?
        Gaugings are aggregated by the mean.
    sites_by_col : bool
        'False' does not make a single DateTimeIndex; rather, the output is indexed by site and date
        (long format). 'True' creates a single DateTimeIndex with the columns as gauging sites (will create
        many NAs).
    min_filter : int or None
        Minimum number of days required for the gaugings output.
    export : str or None
        Either a string path to a csv file or None.
    """

    def resample1(df):
        df.index = df.date
        df2 = df.resample('D').mean()
        return df2

    #### Fields and names for databases
    ## Query fields - Be sure to use single quotes for the names!!!
    fields = ['SiteNo', 'SampleDate', 'Flow']

    ## Equivalent short names for analyses - Use these names!!!
    names = ['site', 'date', 'flow']

    #### Databases
    ### Gaugings data
    server = 'SQL2012PROD03'
    database = 'DataWarehouse'
    table = 'DataWarehouse.dbo.F_SG_BGauging'
    where_col = 'SiteNo'

    ## Will change to the following!!! Or stay as a duplicate...
    # database1 = 'Hydstra'
    # table1 = 'Hydstra.dbo.GAUGINGS'

    ########################################
    ### Read in data
    sites1 = select_sites(sites).tolist()
    data = rd_sql(server=server, database=database, table=table, col_names=fields, where_col=where_col,
                  where_val=sites1).dropna()
    data.columns = names

    ### Aggregate duplicates
    data2 = data.groupby(['site', 'date']).mean().reset_index()

    ### Aggregate by day
    if agg_day:
        data3 = data2.groupby(['site']).apply(resample1).reset_index().dropna()
    else:
        data3 = data2

    ### Filter out sites with less than min_filter
    if min_filter is not None:
        count1 = data3.groupby('site')['flow'].count()
        count_index = count1[count1 >= min_filter].index
        data3 = data3[np.in1d(data3.site.values, count_index)]

    ### Select within date range
    if from_date is not None:
        data3 = data3[data3.date >= from_date]
    if to_date is not None:
        data3 = data3[data3.date <= to_date]

    ### Reorganize data with sites as columns and dates as index
    if sites_by_col:
        data4 = data3.pivot(index='date', columns='site').xs('flow', axis=1).round(4)
    else:
        data4 = data3.round(4)

    if isinstance(export, str):
        if sites_by_col:
            data4.to_csv(export)
        else:
            data4.to_csv(export, index=False)

    return data4
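# Example usage (illustrative sketch; the site numbers are placeholders):
#
# gaugings = rd_henry([68801, 69505], from_date='2000-01-01', agg_day=True,
#                     sites_by_col=True, min_filter=10)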
def rd_hydrotel(sites, hydro_id, from_date=None, to_date=None, resample_code='D', period=1, val_round=3,
                min_count=None, pivot=False, export_path=None):
    """
    Extract time series data from the hydrotel database.

    Parameters
    ----------
    sites : list, array, dataframe, or str
        Site list or a str path to a single column csv file of site names/numbers.
    hydro_id : str
        'river / flow / rec / raw', 'aq / wl / rec / raw', 'atmos / precip / rec / raw',
        'river / wl / rec / raw', or 'river / T / rec / raw'.
    from_date : str or None
        The start date in the format '2000-01-01'.
    to_date : str or None
        The end date in the format '2000-01-01'.
    resample_code : str
        The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc.
    period : int
        The number of resampling periods. e.g. period = 2 and resample_code = 'D' would resample the values
        over a 2 day period.
    val_round : int
        The number of decimals to round the values.
    min_count : int or None
        The minimum number of values required within each resampling period (passed through to rd_sql_ts).
    pivot : bool
        Should the output be pivoted into wide format?
    export_path : str or None
        The path and file name to be saved.

    Returns
    -------
    Series or DataFrame
        A MultiIndex Pandas Series if pivot is False and a DataFrame if True.
    """
    #### Import data and select the correct sites
    sites = select_sites(sites)
    if hydro_id == 'atmos / precip / rec / raw':
        site_ob1 = rd_sql(server, database, objects_tab, ['Site', 'ExtSysId'], 'ExtSysId',
                          sites.astype('int32').tolist())
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'], 'Site', site_ob1.Site.tolist())
        site_val1 = pd.merge(site_val0, site_ob1, on='Site')
    elif hydro_id in ['aq / wl / rec / raw', 'aq / T / rec / raw']:
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'])
        site_val0.loc[:, 'Name'] = site_val0.apply(lambda x: x.Name.split(' ')[0], axis=1)
        site_val1 = site_val0[site_val0.Name.isin(sites)].copy()
        site_val1.loc[:, 'ExtSysId'] = site_val1.loc[:, 'Name']
    else:
        site_val1 = rd_sql(server, database, sites_tab, sites_col, 'ExtSysId', sites.astype('int32').tolist())

    if site_val1.empty:
        raise ValueError('No site(s) in database')

    site_val1.loc[:, 'ExtSysId'] = pd.to_numeric(site_val1.loc[:, 'ExtSysId'], errors='ignore')
    site_val1 = site_val1.drop_duplicates('ExtSysId')
    site_val = site_val1.Site.astype('int32').tolist()

    if isinstance(hydro_id, (list, np.ndarray, pd.Series)):
        hydro_ids = [hydro_ids_dict[i] for i in hydro_id]
    elif isinstance(hydro_id, str):
        hydro_ids = [hydro_ids_dict[hydro_id]]
    else:
        raise ValueError('hydro_id must be a str, list, ndarray, or Series.')

    hydro_ids_val = rd_sql(server, database, hydro_ids_tab, hydro_ids_col, 'Name', hydro_ids)

    where_col = {'Site': site_val,
                 'ObjectVariant': hydro_ids_val.ObjectVariant.astype('int32').tolist(),
                 'ObjectType': hydro_ids_val.ObjectType.astype('int32').tolist()}

    object_val1 = rd_sql(server, database, objects_tab, objects_col, where_col)
    if hydro_id == 'aq / wl / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Water Level']
    elif hydro_id == 'atmos / precip / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Rainfall']
    elif hydro_id == 'river / T / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Water Temperature']
    object_val = object_val1.Object.values.astype(int).tolist()

    #### Rearrange data
    point_val1 = rd_sql(server, database, points_tab, points_col, where_col='Object', where_val=object_val)
    point_val = point_val1.Point.values.astype(int).tolist()

    #### Big merge
    comp_tab1 = pd.merge(site_val1, object_val1[['Object', 'Site']], on='Site')
    comp_tab2 = pd.merge(comp_tab1, point_val1, on='Object')
    comp_tab2.set_index('Point', inplace=True)

    #### Pull out the data
    ### Make SQL statement
    data1 = rd_sql_ts(server, database, data_tab, 'Point', 'DT', 'SampleValue', resample_code, period,
                      resample_dict[hydro_id], val_round, {'Point': point_val}, from_date=from_date,
                      to_date=to_date, min_count=min_count)['SampleValue']

    data1.index.names = ['site', 'time']
    data1.name = 'value'
    site_numbers = [comp_tab2.loc[i, 'ExtSysId'] for i in data1.index.levels[0]]
    data1.index.set_levels(site_numbers, level='site', inplace=True)

    if pivot:
        data3 = data1.unstack(0)
    else:
        data3 = data1

    #### Export and return
    if export_path is not None:
        save_df(data3, export_path)

    return data3
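# Example usage (illustrative sketch; the site number is a placeholder and the module-level
# connection parameters (server, database, table names, resample_dict) must already be defined):
#
# precip = rd_hydrotel([409510], hydro_id='atmos / precip / rec / raw',
#                      from_date='2016-01-01', to_date='2016-12-31', resample_code='D')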
def stream_nat(sites, catch_shp=r'S:\Surface Water\shared\GIS_base\vector\catchments\catch_delin_recorders.shp',
               include_gw=True, max_date='2015-06-30',
               sd_hdf='S:/Surface Water/shared/base_data/usage/sd_est_all_mon_vol.h5', flow_csv=None,
               crc_shp=r'S:\Surface Water\shared\GIS_base\vector\allocations\allo_gis.shp', catch_col='site',
               pivot=False, return_data=False, export_path=None):
    """
    Naturalize stream flows from monthly sums of usage.

    Parameters
    ----------
    sites : list, ndarray, Series
        A list of recorder sites to be naturalised.
    catch_shp : str
        A shapefile of the delineated catchments for all recorders.
    include_gw : bool
        Should stream depleting GW takes be included?
    max_date : str
        The last date to be naturalised. In the form of '2015-06-30'.
    sd_hdf : str
        The hdf file of all the crc/waps with estimated usage and allocation.
    flow_csv : str, DataFrame, Series, or None
        If None, then use the hydro class to import the data. Otherwise, flow data can be imported as a csv
        file with the first column as datetime and each other column as a recorder site in m3/s. It can also
        be a wide DataFrame or a long-format Series.
    crc_shp : str
        A shapefile of all of the locations of the crc/waps.
    catch_col : str
        The column in catch_shp with the site numbers.
    pivot : bool
        Should the output be pivoted?
    return_data : bool
        Should the allocation/usage time series be returned?
    export_path : str or None
        Path to save results as either hdf or csv (or None).

    Returns
    -------
    DataFrame
    """
    qual_codes = [10, 18, 20, 50]

    ### Read in data
    ## Site numbers
    sites1 = select_sites(sites)

    ## Stream depletion
    sd = pd.read_hdf(sd_hdf)
    sd['time'] = pd.to_datetime(sd['time'])
    if include_gw:
        sd1 = sd[sd.time <= max_date]
    else:
        sd1 = sd[(sd.take_type == 'Take Surface Water') & (sd.time <= max_date)]

    ## Recorder flow
    if isinstance(flow_csv, str):
        flow = rd_ts(flow_csv)
        flow.columns = flow.columns.astype(int)
        flow.index.name = 'time'
        flow.columns.name = 'site'
        flow = flow.stack()
        flow.name = 'flow'
        flow.index = flow.index.reorder_levels(['site', 'time'])
        flow = flow.sort_index()
    elif isinstance(flow_csv, pd.DataFrame):
        flow = flow_csv.copy()
        flow.columns = flow.columns.astype(int)
        flow.index.name = 'time'
        flow.columns.name = 'site'
        flow = flow.stack()
        flow.name = 'flow'
        flow.index = flow.index.reorder_levels(['site', 'time'])
        flow = flow.sort_index()
    elif isinstance(flow_csv, pd.Series):
        flow = flow_csv.copy()
    else:
        raise ValueError('Pass something useful to flow_csv.')

    ## crc shp
    crc_loc = gpd.read_file(crc_shp)
    crc_loc1 = pd.merge(crc_loc[['crc', 'take_type', 'allo_block', 'wap', 'use_type', 'geometry']],
                        sd[['crc', 'take_type', 'allo_block', 'wap', 'use_type']].drop_duplicates(),
                        on=['crc', 'take_type', 'allo_block', 'wap', 'use_type'])

    ## Catchment areas shp
    catch = gpd.read_file(catch_shp).drop('NZREACH', axis=1)
    catch = catch[catch[catch_col].isin(sites1)]

    ### Spatial processing of WAPs, catchments, and sites
    ## WAPs to catchments sjoin
    crc_catch, catch2 = pts_poly_join(crc_loc1, catch, catch_col)
    # id_areas = catch2.area.copy()
    # tot_areas = catch2.area.copy()

    ## Unique catchments/gauges
    # sites = wap_catch[catch_col].unique()
    # sites2 = catch[catch_col].unique()

    ### Next data import
    ## Gaugings
    # gaugings = rd_henry(sites=sites.astype('int32'), agg_day=True, sites_by_col=True)
    # gaugings.columns = gaugings.columns.astype(int)

    ## site specific flow
    # rec_sites = flow.columns[in1d(flow.columns, sites)]
    # gauge_sites = sites[~in1d(sites, rec_sites)]
    # gauge_sites2 = gaugings.columns[in1d(gaugings.columns, gauge_sites)]
    # site_flow = flow[rec_sites]
    # gaugings = gaugings[gauge_sites2]

    ### Filter down the sites
    sd1a = pd.merge(crc_catch, sd1, on=['crc', 'take_type', 'allo_block', 'wap', 'use_type']).drop('geometry', axis=1)

    ### Remove excessive usages
    sd1a = sd1a[~((sd1a.sd_usage / sd1a.ann_restr_allo_m3 / 12) >= 1.5)]

    ### Calc SD for site and month
    sd2 = sd1a.groupby(['site', 'time'])['sd_usage'].sum().reset_index()
    days1 = sd2.time.dt.daysinmonth
    sd2['sd_rate'] = sd2.sd_usage / days1 / 24 / 60 / 60

    ### Resample SD to daily time series
    days2 = pd.to_timedelta((days1 / 2).round().astype('int32'), unit='D')
    sd3 = sd2.drop('sd_usage', axis=1)
    sd3.loc[:, 'time'] = sd3.loc[:, 'time'] - days2

    grp1 = sd3.groupby(['site'])
    first1 = grp1.first()
    last1 = sd2.groupby('site')[['time', 'sd_rate']].last()
    first1.loc[:, 'time'] = pd.to_datetime(first1.loc[:, 'time'].dt.strftime('%Y-%m') + '-01')

    sd4 = pd.concat([first1.reset_index(), sd3, last1.reset_index()]).reset_index(drop=True).sort_values(['site', 'time'])
    sd5 = sd4.set_index('time')
    sd6 = sd5.groupby('site').apply(lambda x: x.resample('D').interpolate(method='pchip'))['sd_rate']

    ### Naturalise flows
    nat1 = pd.concat([flow, sd6], axis=1, join='inner')
    nat1['nat_flow'] = nat1['flow'] + nat1['sd_rate']

    ## Normalize to area if desired
    # if norm_area:
    #     # recorder flow in mm/day
    #     site_order = tot_areas[flow1.columns].values / 60 / 60 / 24 / 1000
    #     flow_norm = flow1.div(site_order)
    #     nat_flow_norm = nat_flow.div(site_order)
    #
    #     # Gauges flow in mm/day
    #     site_order = tot_areas[gaugings1.columns].values / 60 / 60 / 24 / 1000
    #     gaugings_norm = gaugings1.div(site_order)
    #     nat_gauge_norm = nat_gauge.div(site_order)
    #
    #     ### Export and return results
    #     if export:
    #         nat_flow_norm.to_csv(export_rec_flow_path)
    #         nat_gauge_norm.to_csv(export_gauge_flow_path)
    #     return [flow_norm, gaugings_norm, nat_flow_norm, nat_gauge_norm]
    # else:
    #     if export:
    #         nat_flow.to_csv(export_rec_flow_path)
    #         nat_gauge.to_csv(export_gauge_flow_path)
    #     return [flow1, gaugings1, nat_flow, nat_gauge]

    if pivot:
        nat2 = nat1.round(3).unstack('site')
    else:
        nat2 = nat1.round(3)

    if isinstance(export_path, str):
        save_df(nat2, export_path)

    if return_data:
        return nat2, sd1a
    else:
        return nat2
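# Example usage (illustrative sketch; the site numbers and the flow csv path are placeholders,
# and the default shapefile/hdf paths must exist on the network drive):
#
# nat_flow = stream_nat([70105, 69607], flow_csv='S:/path/to/flow_recorders_wide.csv', pivot=True)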
def rec_catch_del(sites_shp, rec_streams_shp, rec_catch_shp, sites_col='site', buffer_dis=400, catch_output=None):
    """
    Catchment delineation using the REC streams and catchments.

    Parameters
    ----------
    sites_shp : str path or GeoDataFrame
        Points shapefile of the sites along the streams or the equivalent GeoDataFrame.
    rec_streams_shp : str path, GeoDataFrame, or dict
        str path to the REC streams shapefile, the equivalent GeoDataFrame, or a dict of parameters to read
        in an mssql table using the rd_sql function.
    rec_catch_shp : str path, GeoDataFrame, or dict
        str path to the REC catchment shapefile, the equivalent GeoDataFrame, or a dict of parameters to read
        in an mssql table using the rd_sql function.
    sites_col : str
        The column name of the site numbers in the sites_shp.
    buffer_dis : int
        The buffer distance used when finding the closest REC segment to each site.
    catch_output : str or None
        The output polygon shapefile path of the catchment delineation.

    Returns
    -------
    GeoDataFrame
        Polygons
    """
    ### Parameters
    ### Modifications {NZREACH: {NZTNODE/NZFNODE: node # to change}}
    mods = {13053151: {'NZTNODE': 13055874}, 13048353: {'NZTNODE': 13048851}, 13048498: {'NZTNODE': 13048851}}

    ### Load data
    if isinstance(rec_catch_shp, gpd.GeoDataFrame):
        rec_catch = rec_catch_shp.copy()
    elif isinstance(rec_catch_shp, str):
        if rec_catch_shp.endswith('shp'):
            rec_catch = gpd.read_file(rec_catch_shp)
        else:
            raise ValueError('If rec_catch_shp is a str, then it must be a path to a shapefile.')
    elif isinstance(rec_catch_shp, dict):
        rec_catch = rd_sql(**rec_catch_shp)

    if isinstance(rec_streams_shp, gpd.GeoDataFrame):
        rec_streams = rec_streams_shp.copy()
    elif isinstance(rec_streams_shp, str):
        if rec_streams_shp.endswith('shp'):
            rec_streams = gpd.read_file(rec_streams_shp)
        else:
            raise ValueError('If rec_streams_shp is a str, then it must be a path to a shapefile.')
    elif isinstance(rec_streams_shp, dict):
        rec_streams = rd_sql(**rec_streams_shp)

    pts = select_sites(sites_shp)

    ### Make mods
    for i in mods:
        rec_streams.loc[rec_streams['NZREACH'] == i, list(mods[i].keys())] = list(mods[i].values())

    ### Find closest REC segment to points
    pts_seg = closest_line_to_pts(pts, rec_streams, line_site_col='NZREACH', buffer_dis=buffer_dis)
    nzreach = pts_seg.copy().NZREACH.unique()

    ### Find all upstream reaches
    reaches = find_upstream_rec(nzreach, rec_streams_shp=rec_streams)

    ### Extract associated catchments
    rec_catch = extract_rec_catch(reaches, rec_catch_shp=rec_catch)

    ### Aggregate individual catchments
    rec_shed = agg_rec_catch(rec_catch)
    rec_shed.columns = ['NZREACH', 'geometry', 'area']
    rec_shed1 = rec_shed.merge(pts_seg.drop('geometry', axis=1), on='NZREACH')

    ### Export and return
    if catch_output is not None:
        rec_shed1.to_file(catch_output)
    return rec_shed1
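# Example usage (illustrative sketch; the shapefile paths are placeholders):
#
# catch_poly = rec_catch_del('sites_pts.shp', 'rec_streams.shp', 'rec_catchments.shp',
#                            sites_col='site', buffer_dis=400, catch_output='site_catchments.shp')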