Code example #1
File: hydllp.py  Project: mullenkamp/HydroPandas
def rd_blocklist(sites, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'], start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01'):
    """
    Wrapper function to extract info about when data has changed between modification dates.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    datasources : list of str
        Hydstra datasource codes (usually ['A']).
    variables : list of str
        The Hydstra variable codes to check ('140' is flow).
    start : str
        The start time in the format of '2001-01-01'.
    end : str
        Same formatting as start.
    start_modified: str
        The starting date of the modification.
    end_modified: str
        The ending date of the modification.

    Returns
    -------
    DataFrame
        With site, data_source, varto, from_mod_date, and to_mod_date.
    """
    ### Process sites
    sites1 = select_sites(sites).tolist()

    ### Open connection
    hyd = openHyDb()
    with hyd as h:
        df = h.get_ts_blockinfo(sites1, start=start, end=end, datasources=datasources, variables=variables, start_modified=start_modified, end_modified=end_modified)
    return df
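A minimal usage sketch; the site numbers, dates, and the import path are hypothetical, and a working hydllp/Hydstra installation is assumed:

from hydllp import rd_blocklist  # assumed import path

# Which flow ('140') blocks changed during 2017 for two example sites
blocks = rd_blocklist([69607, 70105], variables=['140'],
                      start_modified='2017-01-01', end_modified='2017-12-31')
print(blocks.head())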
Code example #2
File: flow.py  Project: mullenkamp/HydroPandas
def hydstra_site_mod_time(sites=None):
    """
    Function to extract modification times from Hydstra data archive files. Returns a DataFrame of sites by modification date. The modification date is in GMT.

    Parameters
    ----------
    sites : list, array, Series, or None
        If sites is not None, then return only the given sites.

    Returns
    -------
    DataFrame
    """

    site_files_path = r'\\fileservices02\ManagedShares\Data\Hydstra\prod\hyd\dat\hyd'
    files1 = rd_dir(site_files_path, 'A')
    file_sites = [path.splitext(i)[0] for i in files1]

    if sites is not None:
        sites1 = select_sites(sites).astype(str)
        sites2 = [i.replace('/', '_') for i in sites1]
        file_sites1 = [i for i in file_sites if i in sites2]
    else:
        file_sites1 = file_sites

    mod_times = to_datetime([round(path.getmtime(path.join(site_files_path, i + '.A'))) for i in file_sites1], unit='s')

    df = DataFrame({'site': file_sites1, 'mod_time': mod_times})
    return df
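A usage sketch, assuming the network share hard-coded in site_files_path is reachable and the import path below (hypothetical) is correct:

from flow import hydstra_site_mod_time  # assumed import path

# Modification times for two example site numbers
mod_df = hydstra_site_mod_time(sites=[69607, 70105])
print(mod_df.sort_values('mod_time'))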
Code example #3
File: hydllp.py  Project: mullenkamp/HydroPandas
    def get_ts_traces(self, site_list, start=0, end=0, varfrom=100, varto=140, interval='day', multiplier=1, datasource='A', data_type='mean', qual_codes=[30, 20, 10, 11, 21, 18], report_time=None):
        """

        """

        # Convert the site list to a comma delimited string of sites
        sites = select_sites(site_list).astype(str)
        site_list_str = ','.join([str(site) for site in sites])

        ### Datetime conversion - with dates < 1900
        c1900 = Timestamp('1900-01-01')
        if start != 0:
            start1 = Timestamp(start)
            if start1 > c1900:
                start = start1.strftime('%Y%m%d%H%M%S')
            else:
                start = start1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')
        if end != 0:
            end1 = Timestamp(end)
            if end1 > c1900:
                end = end1.strftime('%Y%m%d%H%M%S')
            else:
                end = end1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')

        ts_traces_request = {'function': 'get_ts_traces',
                             'version': 2,
                             'params': {'site_list': site_list_str,
                                        'start_time': start,
                                        'end_time': end,
                                        'varfrom': varfrom,
                                        'varto': varto,
                                        'interval': interval,
                                        'datasource': datasource,
                                        'data_type': data_type,
                                        'multiplier': multiplier,
                                        'report_time': report_time}}

        ts_traces_request = self.query_by_dict(ts_traces_request)
        j1 = ts_traces_request['return']['traces']

        ### Convert json to a dataframe
        sites = [str(f['site']) for f in j1]

        out1 = DataFrame()
        for i in range(len(j1)):
            df1 = DataFrame(j1[i]['trace'])
            if not df1.empty:
                df1.rename(columns={'v': 'data', 't': 'time', 'q': 'qual_code'}, inplace=True)
                df1['data'] = to_numeric(df1['data'], errors='coerce')
                df1['time'] = to_datetime(df1['time'], format='%Y%m%d%H%M%S')
                df1['qual_code'] = to_numeric(df1['qual_code'], errors='coerce', downcast='integer')
                df1['site'] = sites[i]
                df2 = df1[df1.qual_code.isin(qual_codes)]
                out1 = concat([out1, df2])

        out2 = out1.set_index(['site', 'time'])[['data', 'qual_code']]

        return out2
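A usage sketch mirroring how rd_hydstra_db (code example #5) calls this method; the site number and dates are hypothetical:

hyd = openHyDb()
with hyd as h:
    # Daily mean flow (varfrom 100 -> varto 140) for one example site
    tsdata = h.get_ts_traces([69607], start='2015-01-01', end='2015-12-31',
                             varfrom=100, varto=140, interval='day')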
Code example #4
File: vector.py  Project: mullenkamp/HydroPandas
def flow_sites_to_shp(sites='All',
                      min_flow_only=False,
                      export=False,
                      export_path='sites.shp'):
    """
    Function to create a geopandas/shapefile from flow sites.
    """

    ### Import from databases
    if min_flow_only:
        min_flow_sites = rd_sql('SQL2012PROD05',
                                'Wells',
                                '"vMinimumFlowSites+Consent+Well_classes"',
                                col_names=[
                                    'RefDbase', 'RefDbaseKey',
                                    'restrictionType', 'RecordNo', 'WellNo'
                                ],
                                where_col='RefDbase',
                                where_val=['Gauging', 'Hydrotel'])
        min_flow_sites.columns = ['type', 'site', 'restr', 'crc', 'wap']
        min_flow_sites['site'] = min_flow_sites['site'].astype(int)
        min_flow_sites = min_flow_sites[min_flow_sites.restr == 'LowFlow']

    site_geo = rd_sql('SQL2012PROD05',
                      'GIS',
                      'vGAUGING_NZTM',
                      col_names=['SiteNumber', 'RIVER', 'SITENAME'],
                      geo_col=True)
    site_geo.columns = ['site', 'river', 'site_name', 'geometry']
    site_geo['river'] = site_geo.river.apply(lambda x: x.title())
    site_geo['site_name'] = site_geo.site_name.apply(lambda x: x.title())
    site_geo['site_name'] = site_geo.site_name.apply(
        lambda x: x.replace(' (Recorder)', ''))
    site_geo['site_name'] = site_geo.site_name.apply(
        lambda x: x.replace('Sh', 'SH'))
    site_geo['site_name'] = site_geo.site_name.apply(
        lambda x: x.replace('Ecs', 'ECS'))

    ### Select sites
    if isinstance(sites, str):
        if sites == 'All':
            sites_sel_geo = site_geo
        elif sites.endswith('.shp'):
            poly = read_file(sites)
            sites_sel_geo = sel_sites_poly(poly, site_geo)
        else:
            raise ValueError('If sites is a str, then it must be a shapefile.')
    else:
        sites_sel = select_sites(sites).astype('int32')
        sites_sel_geo = site_geo[in1d(site_geo.site, sites_sel)]
    if min_flow_only:
        sites_sel_geo = sites_sel_geo[in1d(sites_sel_geo.site,
                                           min_flow_sites.site.values)]

    ### Export and return
    if export:
        sites_sel_geo.to_file(export_path)
    return (sites_sel_geo)
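A usage sketch, assuming the SQL2012PROD05 databases are reachable; the shapefile paths are hypothetical:

# All minimum flow sites, exported to a shapefile
flow_sites_to_shp(min_flow_only=True, export=True, export_path='min_flow_sites.shp')

# Only the sites that fall within a polygon shapefile
sites_geo = flow_sites_to_shp(sites='catchment_poly.shp')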
Code example #5
File: hydllp.py  Project: mullenkamp/HydroPandas
def rd_hydstra_db(sites, start=0, end=0, datasource='A', data_type='mean', varfrom=100, varto=140, interval='day', multiplier=1, qual_codes=[30, 20, 10, 11, 21, 18], report_time=None, sites_chunk=20, print_sites=False, export_path=None):
    """
    Wrapper function over hydllp to read in data from Hydstra's database. Must be run in a 32-bit Python. If either start or end is not 0, then they both need a date.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    start : str or int of 0
        The start time in the format of either '2001-01-01' or 0 (for all data).
    end : str or int of 0
        Same formatting as start.
    datasource : str
        Hydstra datasource code (usually 'A').
    data_type : str
        mean, maxmin, max, min, start, end, first, last, tot, point, partialtot, or cum.
    varfrom : int or float
        The hydstra source data variable (100.00 is water level).
    varto : int or float
        The hydstra conversion data variable (140.00 is flow).
    interval : str
        The frequency of the output data (year, month, day, hour, minute, second, period). If data_type is 'point', then interval cannot be 'period' (use anything else, it doesn't matter).
    multiplier : int
        interval frequency.
    qual_codes : list of int
        The quality codes in Hydstra for filtering the data.
    sites_chunk : int
        Number of sites to request from hydllp at one time. Do not change unless you understand what it does.
    print_sites : bool
        Should the site numbers be printed as they are extracted?
    export_path : str or None
        The path and file name where the output should be saved.

    Returns
    -------
    DataFrame
        In long format with site and time as a MultiIndex.
    """

    ### Process sites into workable chunks
    sites1 = select_sites(sites)
    n_chunks = ceil(len(sites1) / float(sites_chunk))
    sites2 = array_split(sites1, n_chunks)

    ### Run instance of hydllp
    data = DataFrame()
    for i in sites2:
        if print_sites:
            print(i)
        ### Open connection
        hyd = openHyDb()
        with hyd as h:
            df = h.get_ts_traces(i, start=start, end=end, datasource=datasource, data_type=data_type, varfrom=varfrom, varto=varto, interval=interval, multiplier=multiplier, qual_codes=qual_codes, report_time=report_time)
        data = concat([data, df])

    if isinstance(export_path, str):
        save_df(data, export_path)

    return (data)
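A usage sketch; per the docstring this must run in a 32-bit Python with hydllp available, and the site numbers, dates, and variable choices below are hypothetical:

# Daily mean flow for two example sites over six years
flow_df = rd_hydstra_db([69607, 70105], start='2010-01-01', end='2015-12-31')

# Daily max water level (varfrom 100 kept as varto 100, i.e. no conversion)
wl_max = rd_hydstra_db([69607], varfrom=100, varto=100, data_type='max')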
Code example #6
File: vector.py  Project: mullenkamp/HydroPandas
def xy_to_gpd(id_col, x_col, y_col, df=None, crs=2193):
    """
    Function to convert a DataFrame with x and y coordinates to a GeoDataFrame.

    Arguments:\n
    df -- Dataframe
    id_col -- the column(s) from the dataframe to be returned. Either a one name string or a list of column names.\n
    xcol -- Either the column name that has the x values within the df or an array of x values.\n
    ycol -- Same as xcol.\n
    crs -- The projection of the data.
    """

    if type(x_col) is str:
        geometry = [Point(xy) for xy in zip(df[x_col], df[y_col])]
    else:
        x1 = select_sites(x_col)
        y1 = select_sites(y_col)
        geometry = [Point(xy) for xy in zip(x1, y1)]
    if isinstance(id_col, str) and (df is not None):
        id_data = df[id_col]
    elif isinstance(id_col, list):
        if df is not None:
            id_data = df[id_col]
        else:
            id_data = id_col
    elif isinstance(id_col, (ndarray, Series, Index)):
        id_data = id_col
    else:
        raise ValueError('id_data could not be determined')
    if isinstance(crs, int):
        crs1 = convert_crs(crs)
    elif isinstance(crs, (str, dict)):
        crs1 = crs
    else:
        raise ValueError('crs must be an int, str, or dict')
    gpd = GeoDataFrame(id_data, geometry=geometry, crs=crs1)
    return (gpd)
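A self-contained usage sketch with made-up NZTM coordinates (crs defaults to 2193):

from pandas import DataFrame

pts = DataFrame({'site': ['a1', 'b2'],
                 'x': [1570000, 1580000],
                 'y': [5180000, 5190000]})
gdf = xy_to_gpd('site', 'x', 'y', pts)  # column names from a df
gdf2 = xy_to_gpd(pts['site'], pts['x'].values, pts['y'].values)  # arrays directly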
Code example #7
File: hydllp.py  Project: mullenkamp/HydroPandas
    def get_ts_blockinfo(self, site_list, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'], start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01', fill_gaps=0, auditinfo=0):
        """

        """

        # Convert the site list to a comma delimited string of sites
        sites = select_sites(site_list).astype(str)
        site_list_str = ','.join([str(site) for site in sites])

        ### Datetime conversion
        start = Timestamp(start).strftime('%Y%m%d%H%M%S')
        end = Timestamp(end).strftime('%Y%m%d%H%M%S')
        start_modified = Timestamp(start_modified).strftime('%Y%m%d%H%M%S')
        end_modified = Timestamp(end_modified).strftime('%Y%m%d%H%M%S')

        ### dict request
        ts_blockinfo_request = {"function": "get_ts_blockinfo",
                                "version": 2,
                                "params": {'site_list': site_list_str,
                                           'datasources': datasources,
                                           'variables': variables,
                                           'starttime': start,
                                           'endtime': end,
                                           'start_modified': start_modified,
                                           'end_modified': end_modified
                                           }}

        ts_blockinfo_result = self.query_by_dict(ts_blockinfo_request)
        blocks = ts_blockinfo_result['return']['blocks']
        df1 = DataFrame(blocks)
        if df1.empty:
            return df1
        else:
            df1['endtime'] = to_datetime(df1['endtime'], format='%Y%m%d%H%M%S')
            df1['starttime'] = to_datetime(df1['starttime'], format='%Y%m%d%H%M%S')
            df1['variable'] = to_numeric(df1['variable'], errors='coerce', downcast='integer')
            df2 = df1[['site', 'datasource', 'variable', 'starttime', 'endtime']].sort_values(['site', 'variable', 'starttime'])
            df2.rename(columns={'datasource': 'data_source', 'variable': 'varto', 'starttime': 'from_mod_date', 'endtime': 'to_mod_date'}, inplace=True)

            return df2
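A usage sketch mirroring rd_blocklist (code example #1); the site number and modification dates are hypothetical:

hyd = openHyDb()
with hyd as h:
    blocks = h.get_ts_blockinfo([69607], variables=['140'],
                                start_modified='2017-01-01',
                                end_modified='2017-12-31')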
Code example #8
def stream_nat(
        sites,
        catch_shp=r'P:\cant_catch_delin\recorders\catch_del.shp',
        include_gw=True,
        max_date='2015-06-30',
        sd_hdf='S:/Surface Water/shared/base_data/usage/sd_est_all_mon_vol.h5',
        flow_csv=None,
        crc_shp=r'S:\Surface Water\shared\GIS_base\vector\allocations\allo_gis.shp',
        catch_col='site',
        pivot=False,
        return_data=False,
        export_path=None):
    """
    Function to naturalize stream flows from monthly sums of usage.

    sites -- A list of recorder sites to be naturalised.\n
    catch_shp -- A shapefile of the delineated catchments for all recorders.\n
    include_gw -- Should stream depleting GW takes be included?\n
    max_date -- The last date to be naturalised. In the form of '2015-06-30'.\n
    sd_hdf -- The hdf file of all the crc/waps with estimated usage and allocation.\n
    flow_csv -- If None, then use the hydro class to import the data. Otherwise, flow data can be imported as a csv file with the first column as datetime and each other column as a recorder site in m3/s. It can also be a dataframe.\n
    crc_shp -- A shapefile of all of the locations of the crc/waps.\n
    catch_col -- The column in catch_shp containing the site numbers.\n
    pivot -- Should the output be pivoted?\n
    return_data -- Should the allocation/usage time series also be returned?\n
    export_path -- If a str, the path where the output is saved.
    """

    qual_codes = [10, 18, 20, 50]

    ### Read in data
    ## Site numbers
    sites1 = select_sites(sites)

    ## Stream depletion
    sd = read_hdf(sd_hdf)
    if include_gw:
        sd1 = sd[sd.time <= max_date]
    else:
        sd1 = sd[(sd.take_type == 'Take Surface Water')
                 & (sd.time <= max_date)]

    ## Recorder flow
    if type(flow_csv) is str:
        flow = rd_ts(flow_csv)
        flow.columns = flow.columns.astype(int)
        flow.index.name = 'time'
        flow.columns.name = 'site'
        flow = flow.stack()
        flow.name = 'flow'
        flow.index = flow.index.reorder_levels(['site', 'time'])
        flow = flow.sort_index()
    elif isinstance(flow_csv, DataFrame):
        flow = flow_csv.copy()
        flow.columns = flow.columns.astype(int)
        flow.index.name = 'time'
        flow.columns.name = 'site'
        flow = flow.stack()
        flow.name = 'flow'
        flow.index = flow.index.reorder_levels(['site', 'time'])
        flow = flow.sort_index()
    elif flow_csv is None:
        flow = hydro().get_data(mtypes='flow',
                                sites=sites1,
                                qual_codes=qual_codes)
        sites1 = flow.sites
        flow = flow.data
        flow.index = flow.index.droplevel('mtype')
        flow.name = 'flow'
    else:
        raise ValueError('Pass something useful to flow_csv.')

    ## crc shp
    crc_loc = read_file(crc_shp)
    crc_loc1 = merge(crc_loc[[
        'crc', 'take_type', 'allo_block', 'wap', 'use_type', 'geometry'
    ]],
                     sd[['crc', 'take_type', 'allo_block', 'wap',
                         'use_type']].drop_duplicates(),
                     on=['crc', 'take_type', 'allo_block', 'wap', 'use_type'])

    ## Catchment areas shp
    catch = read_file(catch_shp).drop('NZREACH', axis=1)
    catch = catch[catch[catch_col].isin(sites1)]

    ### Spatial processing of WAPs, catchments, and sites
    ## WAPs to catchments sjoin
    crc_catch, catch2 = pts_poly_join(crc_loc1, catch, catch_col)

    #    id_areas = catch2.area.copy()
    #    tot_areas = catch2.area.copy()
    #
    #    ## Unique catchments/gauges
    ##    sites = wap_catch[catch_col].unique()
    #    sites2 = catch[catch_col].unique()

    ### Next data import
    ## Gaugings
    #    gaugings = rd_henry(sites=sites.astype('int32'), agg_day=True, sites_by_col=True)
    #    gaugings.columns = gaugings.columns.astype(int)

    ## site specific flow
    #    rec_sites = flow.columns[in1d(flow.columns, sites)]
    #    gauge_sites = sites[~in1d(sites, rec_sites)]
    #    gauge_sites2 = gaugings.columns[in1d(gaugings.columns, gauge_sites)]
    #    site_flow = flow[rec_sites]
    #    gaugings = gaugings[gauge_sites2]

    ### filter down the sites
    sd1a = merge(crc_catch,
                 sd1,
                 on=['crc', 'take_type', 'allo_block', 'wap',
                     'use_type']).drop('geometry', axis=1)

    ### Remove excessive usages
    sd1a = sd1a[~((sd1a.sd_usage / sd1a.ann_restr_allo_m3 / 12) >= 1.5)]

    ### Calc SD for site and month
    sd2 = sd1a.groupby(['site', 'time'])['sd_usage'].sum().reset_index()
    days1 = sd2.time.dt.daysinmonth
    sd2['sd_rate'] = sd2.sd_usage / days1 / 24 / 60 / 60

    ### Resample SD to daily time series
    days2 = to_timedelta((days1 / 2).round().astype('int32'), unit='D')
    sd3 = sd2.drop('sd_usage', axis=1)
    sd3.loc[:, 'time'] = sd3.loc[:, 'time'] - days2
    grp1 = sd3.groupby(['site'])
    first1 = grp1.first()
    last1 = sd2.groupby('site')[['time', 'sd_rate']].last()
    first1.loc[:,
               'time'] = to_datetime(first1.loc[:,
                                                'time'].dt.strftime('%Y-%m') +
                                     '-01')
    sd4 = concat([first1.reset_index(), sd3,
                  last1.reset_index()
                  ]).reset_index(drop=True).sort_values(['site', 'time'])
    sd5 = sd4.set_index('time')
    sd6 = sd5.groupby('site').apply(
        lambda x: x.resample('D').interpolate(method='pchip'))['sd_rate']

    ### Naturalise flows
    nat1 = concat([flow, sd6], axis=1, join='inner')
    nat1['nat_flow'] = nat1['flow'] + nat1['sd_rate']

    ## Normalize to area if desired
    #    if norm_area:
    #        # recorder flow in mm/day
    #        site_order = tot_areas[flow1.columns].values / 60 / 60 / 24 / 1000
    #        flow_norm = flow1.div(site_order)
    #        nat_flow_norm = nat_flow.div(site_order)
    #
    #        # Gauges flow in mm/day
    #        site_order = tot_areas[gaugings1.columns].values / 60 / 60 / 24 / 1000
    #        gaugings_norm = gaugings1.div(site_order)
    #        nat_gauge_norm = nat_gauge.div(site_order)
    #
    #        ### Export and return results
    #        if export:
    #            nat_flow_norm.to_csv(export_rec_flow_path)
    #            nat_gauge_norm.to_csv(export_gauge_flow_path)
    #        return([flow_norm, gaugings_norm, nat_flow_norm, nat_gauge_norm])
    #    else:
    #        if export:
    #            nat_flow.to_csv(export_rec_flow_path)
    #            nat_gauge.to_csv(export_gauge_flow_path)
    #        return([flow1, gaugings1, nat_flow, nat_gauge])
    if pivot:
        nat2 = nat1.round(3).unstack('site')
    else:
        nat2 = nat1.round(3)
    if isinstance(export_path, str):
        save_df(nat2, export_path)
    if return_data:
        return (nat2, sd1a)
    else:
        return (nat2)
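A usage sketch, assuming the internal network paths in the default arguments are reachable; the site numbers are hypothetical:

# Naturalised flows in wide format
nat_flow = stream_nat([69607, 70105], pivot=True)

# Also return the filtered stream depletion/usage table
nat_flow, sd_data = stream_nat([69607], return_data=True)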
Code example #9
File: flow.py  Project: mullenkamp/HydroPandas
def rd_henry(sites, from_date=None, to_date=None, agg_day=True, sites_by_col=False, min_filter=None, export=False,
             export_path='gauge_flows.csv'):
    """
    Function to read in gaugings data from the "Henry DB". Hopefully, they keep this around for a while longer.

    Arguments:\n
    sites -- Either a list of site names or a file path string that contains a column of site names.\n
    from_date -- A date string for the start of the data (e.g. '2010-01-01').\n
    to_date -- A date string for the end of the data.\n
    agg_day -- Should the gauging dates be aggregated down to the day, as opposed to keeping the hour and minute? Gaugings are aggregated by the mean.\n
    sites_by_col -- 'False' does not make a time series; rather, the output is organized by site, date, and gauging. 'True' creates a time series with the columns as gauging sites (will create many NAs).\n
    min_filter -- Minimum number of gaugings required for a site to be kept in the output.\n
    export -- Should the data be exported to csv?\n
    export_path -- The path of the csv to be saved.
    """

    def resample1(df):
        df.index = df.date
        df2 = df.resample('D').mean()
        return (df2)

    #### Fields and names for databases

    ## Query fields - Be sure to use single quotes for the names!!!

    fields = ['SiteNo', 'SampleDate', 'Flow']

    ## Equivalent short names for analyses - Use these names!!!

    names = ['site', 'date', 'flow']

    #### Databases

    ### Gaugings data

    server = 'SQL2012PROD03'
    database = 'DataWarehouse'

    table = 'DataWarehouse.dbo.F_SG_BGauging'
    where_col = 'SiteNo'

    ## Will change to the following!!! Or stay as a duplicate...

    # database1 = 'Hydstra'

    # table1 = 'Hydstra.dbo.GAUGINGS'

    ########################################
    ### Read in data

    sites1 = select_sites(sites).tolist()
    data = rd_sql(server=server, database=database, table=table, col_names=fields, where_col=where_col,
                  where_val=sites1).dropna()
    data.columns = names

    ### Aggregate duplicates

    data2 = data.groupby(['site', 'date']).mean().reset_index()

    ### Aggregate by day

    if agg_day:
        data3 = data2.groupby(['site']).apply(resample1).reset_index().dropna()
    else:
        data3 = data2

    ### Filter out sites with less than min_filter
    if min_filter is not None:
        count1 = data3.groupby('site')['flow'].count()
        count_index = count1[count1 >= min_filter].index
        data3 = data3[in1d(data3.site.values, count_index)]

    ### Select within date range
    if from_date is not None:
        data3 = data3[data3.date >= from_date]
    if to_date is not None:
        data3 = data3[data3.date <= to_date]

    ### reorganize data with sites as columns and dates as index

    if sites_by_col:
        data4 = data3.pivot(index='date', columns='site').xs('flow', axis=1).round(4)
    else:
        data4 = data3.round(4)

    if export:
        if sites_by_col:
            data4.to_csv(export_path)
        else:
            data4.to_csv(export_path, index=False)

    return (data4)
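A usage sketch against the Henry DB; the site numbers are hypothetical:

# Daily-aggregated gaugings since 2010, organised by site and date
gaugings = rd_henry([69607, 70105], from_date='2010-01-01')

# As a time series with sites as columns, keeping sites with at least 10 gaugings
gauge_ts = rd_henry([69607, 70105], sites_by_col=True, min_filter=10)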
Code example #10
File: flow.py  Project: mullenkamp/HydroPandas
def rd_hydrotel(sites, mtype='river_flow_cont_raw', from_date=None, to_date=None, resample_code='D', period=1,
                fun='mean', val_round=3, min_count=None, pivot=False, export_path=None):
    """
    Function to extract time series data from the hydrotel database.

    Parameters
    ----------
    sites : list, array, dataframe, or str
        Site list or a str path to a single column csv file of site names/numbers.
    mtype : str
        'river_flow_cont_raw', 'aq_wl_cont_raw', 'atmos_precip_cont_raw', 'river_wl_cont_raw', or 'river_wtemp_cont_raw'.
    from_date : str or None
        The start date in the format '2000-01-01'.
    to_date : str or None
        The end date in the format '2000-01-01'.
    resample_code : str
        The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc.
    period : int
        The number of resampling periods. e.g. period = 2 and resample = 'D' would be to resample the values over a 2 day period.
    fun : str
        The resampling function. i.e. mean, sum, count, min, or max. No median yet...
    val_round : int
        The number of decimals to round the values.
    min_count : int or None
        Passed through to rd_sql_ts (presumably the minimum count of values required per resampling period).
    pivot : bool
        Should the output be pivotted into wide format?
    export_path : str or None
        The path and file name to be saved.

    Returns
    -------
    Series or DataFrame
        A MultiIndex Pandas Series if pivot is False and a DataFrame if True
    """

    #### mtypes dict
    mtypes_dict = {'river_flow_cont_raw': 'Flow Rate', 'aq_wl_cont_raw': 'Water Level',
                   'atmos_precip_cont_raw': 'Rainfall Depth', 'river_wl_cont_raw': 'Water Level',
                   'river_wtemp_cont_raw': 'Water Temperature'}

    #### Database parameters
    server = 'SQL2012PROD05'
    database = 'Hydrotel'

    data_tab = 'Hydrotel.dbo.Samples'
    points_tab = 'Hydrotel.dbo.Points'
    objects_tab = 'Hydrotel.dbo.Objects'
    mtypes_tab = 'Hydrotel.dbo.ObjectVariants'
    sites_tab = 'Hydrotel.dbo.Sites'

    data_col = ['Point', 'DT', 'SampleValue']
    points_col = ['Point', 'Object']
    objects_col = ['Object', 'Site', 'Name', 'ObjectVariant']
    mtypes_col = ['ObjectVariant', 'Name']
    sites_col = ['Site', 'Name', 'ExtSysId']

    #### Import data and select the correct sites

    sites = select_sites(sites)
    if mtype == 'atmos_precip_cont_raw':
        site_ob1 = rd_sql(server, database, objects_tab, ['Site', 'ExtSysId'], 'ExtSysId',
                          sites.astype('int32').tolist())
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'], 'Site', site_ob1.Site.tolist())
        site_val1 = merge(site_val0, site_ob1, on='Site')
    elif mtype == 'aq_wl_cont_raw':
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'])
        site_val0.loc[:, 'Name'] = site_val0.apply(lambda x: x.Name.split(' ')[0], axis=1)
        site_val1 = site_val0[site_val0.Name.isin(sites)]
        site_val1.loc[:, 'ExtSysId'] = site_val1.loc[:, 'Name']
    else:
        site_val1 = rd_sql(server, database, sites_tab, sites_col, 'ExtSysId', sites.astype('int32').tolist())

    if site_val1.empty:
        raise ValueError('No site(s) in database')

    site_val1.loc[:, 'ExtSysId'] = to_numeric(site_val1.loc[:, 'ExtSysId'], errors='ignore')
    site_val = site_val1.Site.astype('int32').tolist()
    if isinstance(mtype, (list, ndarray, Series)):
        mtypes = [mtypes_dict[i] for i in mtype]
    elif isinstance(mtype, str):
        mtypes = [mtypes_dict[mtype]]
    else:
        raise ValueError('mtype must be a str, list, ndarray, or Series.')
    mtypes_val = rd_sql(server, database, mtypes_tab, mtypes_col, 'Name', mtypes)

    where_col = {'Site': site_val, 'ObjectVariant': mtypes_val.ObjectVariant.astype('int32').tolist()}

    object_val1 = rd_sql(server, database, objects_tab, objects_col, where_col)
    if mtype == 'aq_wl_cont_raw':
        object_val1 = object_val1[object_val1.Name == 'Water Level']
    if mtype == 'atmos_precip_cont_raw':
        object_val1 = object_val1[object_val1.Name == 'Rainfall']
    object_val = object_val1.Object.values.astype(int).tolist()

    #### Rearrange data
    point_val1 = rd_sql(server, database, points_tab, points_col, where_col='Object', where_val=object_val)
    point_val = point_val1.Point.values.astype(int).tolist()

    #### Big merge
    comp_tab1 = merge(site_val1, object_val1[['Object', 'Site']], on='Site')
    comp_tab2 = merge(comp_tab1, point_val1, on='Object')
    comp_tab2.set_index('Point', inplace=True)

    #### Pull out the data
    ### Make SQL statement
    data1 = rd_sql_ts(server, database, data_tab, 'Point', 'DT', 'SampleValue', resample_code, period, fun, val_round,
                      {'Point': point_val}, from_date=from_date, to_date=to_date, min_count=min_count)['SampleValue']

    data1.index.names = ['site', 'time']
    data1.name = 'value'
    site_numbers = [comp_tab2.loc[i, 'ExtSysId'] for i in data1.index.levels[0]]
    data1.index.set_levels(site_numbers, level='site', inplace=True)

    if pivot:
        data3 = data1.unstack(0)
    else:
        data3 = data1

    #### Export and return
    if export_path is not None:
        save_df(data3, export_path)

    return data3
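A usage sketch, assuming the Hydrotel database is reachable; site numbers and dates are hypothetical:

# Daily mean flow for two example sites
tel = rd_hydrotel([69607, 70105], mtype='river_flow_cont_raw',
                  from_date='2017-01-01', to_date='2017-06-30')

# Weekly max in wide format
wide = rd_hydrotel([69607], resample_code='W', fun='max', pivot=True)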
Code example #11
def rd_squalarc(sites,
                mtypes=None,
                from_date=None,
                to_date=None,
                convert_dtl=False,
                dtl_method=None,
                export_path=None):
    """
    Function to read in "squalarc" data, which is actually stored in the MSSQL db.

    Arguments:\n
    sites -- The site names as a list, array, csv with the first column as the site names, or a polygon shapefile of the area of interest.\n
    mtypes -- A list of measurement type names to be in the output. Leaving it empty returns all mtypes.\n
    from_date -- A start date string in the format of '2010-01-01'.\n
    to_date -- An end date string in the format of '2011-01-01'.\n
    convert_dtl -- Should values under the detection limit be converted to numeric?\n
    dtl_method -- The method to use to convert values under a detection limit to numeric. None or 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis which includes an additional column dtl_ratio referring to the ratio of values under the detection limit.
    """

    #### Read in sites
    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05',
                              'Squalarc',
                              'SITES',
                              col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry',
                                                               axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05',
                              'Wells',
                              'WELL_DETAILS',
                              col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry',
                                                               axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = Series(sites1, name='site').astype(str).tolist()

    #### Extract the rest of the data
    if len(sites2) > 10000:
        n_chunks = int(ceil(len(sites2) * 0.0001))
        sites3 = [sites2[i::n_chunks] for i in range(n_chunks)]
        samples_tab = DataFrame()
        for i in sites3:
            samples_tab1 = rd_sql('SQL2012PROD05',
                                  'Squalarc',
                                  '"SQL_SAMPLE_METHODS+"',
                                  col_names=[
                                      'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                      'Collect_Date', 'Collect_Time',
                                      'PA_NAME', 'PARAM_UNITS', 'SRESULT'
                                  ],
                                  where_col='Site_ID',
                                  where_val=i)
            samples_tab1.columns = [
                'site', 'sample_id', 'source', 'date', 'time', 'parameter',
                'units', 'val'
            ]
            samples_tab1.loc[:,
                             'source'] = samples_tab1.loc[:,
                                                          'source'].str.lower(
                                                          )
            samples_tab = concat([samples_tab, samples_tab1])
    else:
        samples_tab = rd_sql('SQL2012PROD05',
                             'Squalarc',
                             '"SQL_SAMPLE_METHODS+"',
                             col_names=[
                                 'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                 'Collect_Date', 'Collect_Time', 'PA_NAME',
                                 'PARAM_UNITS', 'SRESULT'
                             ],
                             where_col='Site_ID',
                             where_val=sites2)
        samples_tab.columns = [
            'site', 'sample_id', 'source', 'date', 'time', 'parameter',
            'units', 'val'
        ]
        samples_tab.loc[:, 'source'] = samples_tab.loc[:, 'source'].str.lower()

    samples_tab2 = samples_tab.copy()
    num_test = to_numeric(samples_tab2.loc[:, 'time'], 'coerce')
    samples_tab2.loc[num_test.isnull(), 'time'] = '0000'
    # Strip literal periods from the time strings (regex=False so '.' is not a wildcard)
    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('.', '', regex=False)
    samples_tab2 = samples_tab2[samples_tab2.date.notnull()]
    #    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('9999', '0000')
    time1 = to_datetime(samples_tab2.time, format='%H%M', errors='coerce')
    time1[time1.isnull()] = Timestamp('2000-01-01 00:00:00')
    datetime1 = to_datetime(
        samples_tab2.date.dt.date.astype(str) + ' ' +
        time1.dt.time.astype(str))
    samples_tab2.loc[:, 'date'] = datetime1
    samples_tab2 = samples_tab2.drop('time', axis=1)
    samples_tab2.loc[samples_tab2.val.isnull(), 'val'] = nan
    samples_tab2.loc[samples_tab2.val == 'N/A', 'val'] = nan

    #### Select within time range
    if isinstance(from_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] >= from_date]
    if isinstance(to_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] <= to_date]

    if mtypes is not None:
        mtypes1 = select_sites(mtypes)
        data = samples_tab2[samples_tab2.parameter.isin(mtypes1)].reset_index(
            drop=True)
    else:
        data = samples_tab2.reset_index(drop=True)

    #### Correct poorly typed in site names
    data.loc[:, 'site'] = data.loc[:, 'site'].str.upper().str.replace(' ', '')

    #### Convert detection limit values
    if convert_dtl:
        less1 = data['val'].str.match('<')
        if less1.sum() > 0:
            less1.loc[less1.isnull()] = False
            data2 = data.copy()
            data2.loc[less1,
                      'val'] = to_numeric(data.loc[less1, 'val'].str.replace(
                          '<', ''),
                                          errors='coerce') * 0.5
            if dtl_method in (None, 'standard'):
                data3 = data2
            if dtl_method == 'trend':
                df1 = data2.loc[less1]
                count1 = data.groupby('parameter')['val'].count()
                count1.name = 'tot_count'
                count_dtl = df1.groupby('parameter')['val'].count()
                count_dtl.name = 'dtl_count'
                count_dtl_val = df1.groupby('parameter')['val'].nunique()
                count_dtl_val.name = 'dtl_val_count'
                combo1 = concat([count1, count_dtl, count_dtl_val],
                                axis=1,
                                join='inner')
                combo1['dtl_ratio'] = (combo1['dtl_count'] /
                                       combo1['tot_count']).round(2)

                ## conditionals
                #            param1 = combo1[(combo1['dtl_ratio'] <= 0.4) | (combo1['dtl_ratio'] == 1)]
                #            under_40 = data['parameter'].isin(param1.index)
                param2 = combo1[(combo1['dtl_ratio'] > 0.4)
                                & (combo1['dtl_val_count'] != 1)]
                over_40 = data['parameter'].isin(param2.index)

                ## Calc detection limit values
                data3 = merge(data,
                              combo1['dtl_ratio'].reset_index(),
                              on='parameter',
                              how='left')
                data3.loc[:, 'val_dtl'] = data2['val']

                max_dtl_val = data2[over_40 & less1].groupby(
                    'parameter')['val'].transform('max')
                max_dtl_val.name = 'dtl_val_max'
                data3.loc[over_40 & less1, 'val_dtl'] = max_dtl_val
        else:
            data3 = data
    else:
        data3 = data

    #### Return and export
    if isinstance(export_path, str):
        data3.to_csv(export_path, encoding='utf-8', index=False)
    return (data3)
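A usage sketch; the site ID and measurement type name below are hypothetical, and the Squalarc database is assumed reachable:

wq = rd_squalarc(['SQ30016'],                  # hypothetical site ID
                 mtypes=['Nitrate Nitrogen'],  # hypothetical mtype name
                 from_date='2005-01-01', to_date='2015-12-31',
                 convert_dtl=True, dtl_method='trend')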
Code example #12
def rd_ht_wq_data(hts,
                  sites=None,
                  mtypes=None,
                  start=None,
                  end=None,
                  dtl_method=None,
                  output_site_data=False,
                  mtype_params=None,
                  sample_params=None):
    """
    Function to read data from an hts file and optionally select specific sites and aggregate the data.

    Parameters
    ----------
    hts : str
        Path to the hts file.
    sites : list
        A list of site names within the hts file.
    mtypes : list
        A list of measurement types that should be returned.
    start : str
        The start date to retrieve from the data in ISO format (e.g. '2011-11-30 00:00').
    end : str
        The end date to retrieve from the data in ISO format (e.g. '2011-11-30 00:00').
    dtl_method : None, 'standard', 'trend'
        The method to use to convert values under a detection limit to numeric. None does no conversion. 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis which includes an additional column dtl_ratio referring to the ratio of values under the detection limit.
    output_site_data : bool
        Should the site data be output?
    mtype_params : list or None
        Measurement parameter names to read for each value and return as extra columns.
    sample_params : list or None
        'WQ Sample' parameter names to read for each value and return as extra columns.

    Returns
    -------
    DataFrame
    """

    #    agg_unit_dict = {'l/s': 1, 'm3/s': 1, 'm3/hour': 1, 'mm': 1, 'm3': 4}
    #    unit_convert = {'l/s': 0.001, 'm3/s': 1, 'm3/hour': 1, 'mm': 1, 'm3': 4}

    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05',
                              'Squalarc',
                              'SITES',
                              col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry',
                                                               axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05',
                              'Wells',
                              'WELL_DETAILS',
                              col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry',
                                                               axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = sites1

    ### First read all of the sites in the hts file and select the ones to be read
    sites_df = rd_hilltop_sites(hts,
                                sites=sites2,
                                mtypes=mtypes,
                                rem_wq_sample=False)

    ### Open the hts file
    wqr = Dispatch("Hilltop.WQRetrieval")
    dfile = Dispatch("Hilltop.DataFile")
    try:
        dfile.Open(hts)
    except ValueError:
        print(dfile.errmsg)

    ### Iterate through the hts file
    df_lst = []
    for i in sites_df.index:
        site = sites_df.loc[i, 'site']
        mtype = sites_df.loc[i, 'mtype']
        if mtype == 'WQ Sample':
            continue
        wqr = dfile.FromWQSite(site, mtype)

        ## Set up start and end times and aggregation initiation
        if (start is None):
            start1 = wqr.DataStartTime
        else:
            start1 = start
        if end is None:
            end1 = wqr.DataEndTime
        else:
            end1 = end

        wqr.FromTimeRange(start1, end1)

        ## Extract data
        data = []
        time = []

        ## Initialise here so the `if sample_p:` check below is always defined
        sample_p = []
        mtype_p = []
        test_params = sites_df[sites_df.site == site].mtype.unique()
        if ('WQ Sample' in test_params) and (isinstance(mtype_params, list)
                                             or isinstance(sample_params, list)):
            while wqr.GetNext:
                data.append(wqr.value)
                time.append(str(pytime_to_datetime(wqr.time)))
                sample_p.append({
                    sp: wqr.params(sp).encode('ascii', 'ignore')
                    for sp in sample_params
                })
                mtype_p.append({
                    mp: wqr.params(mp).encode('ascii', 'ignore')
                    for mp in mtype_params
                })
        else:
            while wqr.GetNext:
                data.append(wqr.value)
                time.append(str(pytime_to_datetime(wqr.time)))

        if data:
            df_temp = DataFrame({
                'time': time,
                'data': data,
                'site': site,
                'mtype': mtype
            })
            if sample_p:
                df_temp = concat(
                    [df_temp, DataFrame(sample_p),
                     DataFrame(mtype_p)], axis=1)
            df_lst.append(df_temp)

    dfile.Close()
    wqr.close()
    if df_lst:
        data = concat(df_lst)
        data.loc[:, 'time'] = to_datetime(data.loc[:, 'time'])
        data1 = to_numeric(data.loc[:, 'data'], errors='coerce')
        data.loc[data1.notnull(), 'data'] = data1[data1.notnull()]
        #        data.loc[:, 'data'].str.replace('*', '')
        data = data.reset_index(drop=True)

        #### Convert detection limit values
        if dtl_method is not None:
            less1 = data['data'].str.match('<')
            if less1.sum() > 0:
                less1.loc[less1.isnull()] = False
                data2 = data.copy()
                data2.loc[less1, 'data'] = to_numeric(
                    data.loc[less1, 'data'].str.replace('<', ''),
                    errors='coerce') * 0.5
                if dtl_method == 'standard':
                    data3 = data2
                if dtl_method == 'trend':
                    df1 = data2.loc[less1]
                    count1 = data.groupby('mtype')['data'].count()
                    count1.name = 'tot_count'
                    count_dtl = df1.groupby('mtype')['data'].count()
                    count_dtl.name = 'dtl_count'
                    count_dtl_val = df1.groupby('mtype')['data'].nunique()
                    count_dtl_val.name = 'dtl_val_count'
                    combo1 = concat([count1, count_dtl, count_dtl_val],
                                    axis=1,
                                    join='inner')
                    combo1['dtl_ratio'] = (combo1['dtl_count'] /
                                           combo1['tot_count']).round(2)

                    ## conditionals
                    param2 = combo1[(combo1['dtl_ratio'] > 0.4)
                                    & (combo1['dtl_val_count'] != 1)]
                    over_40 = data['mtype'].isin(param2.index)

                    ## Calc detection limit values
                    data3 = merge(data,
                                  combo1['dtl_ratio'].reset_index(),
                                  on='mtype',
                                  how='left')
                    data3.loc[:, 'data_dtl'] = data2['data']

                    max_dtl_val = data2[over_40 & less1].groupby(
                        'mtype')['data'].transform('max')
                    max_dtl_val.name = 'dtl_data_max'
                    data3.loc[over_40 & less1, 'data_dtl'] = max_dtl_val
            else:
                data3 = data
        else:
            data3 = data

        if output_site_data:
            sites_df = sites_df[~(sites_df.mtype == 'WQ Sample')]
            return (data3, sites_df)
        else:
            return (data3)
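A usage sketch; this needs the Hilltop COM libraries (win32com Dispatch) registered on Windows, and the hts path, site ID, and mtype below are hypothetical:

wq_data, site_info = rd_ht_wq_data(r'\\server\share\wq_data.hts',
                                   sites=['SQ30016'],
                                   mtypes=['Nitrate Nitrogen'],
                                   dtl_method='standard',
                                   output_site_data=True)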
Code example #13
def rd_hilltop_sites(hts, sites=None, mtypes=None, rem_wq_sample=True):
    """
    Function to read the site names, measurement types, and units of a Hilltop hts file. Returns a DataFrame.

    Arguments:\n
    hts -- Path to the hts file (str).\n
    sites -- A list of site names within the hts file.\n
    mtypes -- A list of measurement types that should be returned.
    """

    if sites is not None:
        sites = select_sites(sites)
    if mtypes is not None:
        mtypes = select_sites(mtypes)

    cat = Dispatch("Hilltop.Catalogue")
    if not cat.Open(hts):
        raise ValueError(cat.errmsg)

    dfile = Dispatch("Hilltop.DataRetrieval")
    try:
        dfile.Open(hts)
    except ValueError:
        print(dfile.errmsg)

    sites_lst = []

    ### Iterate through all sites/datasources/mtypes
    cat.StartSiteEnum
    while cat.GetNextSite:
        site_name = cat.SiteName
        if sites is None:
            pass
        elif site_name in sites:
            pass
        else:
            continue
        while cat.GetNextDataSource:
            ds_name = cat.DataSource
            try:
                start1 = pytime_to_datetime(cat.DataStartTime)
                end1 = pytime_to_datetime(cat.DataEndTime)
            except ValueError:
                bool_site = dfile.FromSite(site_name, ds_name, 1)
                if bool_site:
                    start1 = pytime_to_datetime(cat.DataStartTime)
                    end1 = pytime_to_datetime(cat.DataEndTime)
                else:
                    # Keep start1/end1 defined for the append below
                    start1 = None
                    end1 = None
                    print('No site data for ' + site_name +
                          '...for some reason...')
            while cat.GetNextMeasurement:
                mtype1 = cat.Measurement
                if mtype1 == 'Item2':
                    continue
                elif mtypes is None:
                    pass
                elif mtype1 in mtypes:
                    pass
                else:
                    continue
                divisor = cat.Divisor
                unit1 = cat.Units
                if unit1 == '%':
                    #                    print('Site ' + name1 + ' has no units')
                    unit1 = ''
                sites_lst.append([
                    site_name, ds_name, mtype1, unit1, divisor,
                    str(start1),
                    str(end1)
                ])

    sites_df = DataFrame(sites_lst,
                         columns=[
                             'site', 'data_source', 'mtype', 'unit', 'divisor',
                             'start_date', 'end_date'
                         ])
    if rem_wq_sample:
        sites_df = sites_df[~(sites_df.mtype == 'WQ Sample')]
    dfile.Close()
    cat.Close()
    return (sites_df)
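A usage sketch with a hypothetical hts path and mtype name; like the previous example, it needs the Hilltop COM libraries registered:

site_info = rd_hilltop_sites(r'\\server\share\wq_data.hts',
                             mtypes=['Nitrate Nitrogen'])
print(site_info[['site', 'mtype', 'start_date', 'end_date']])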