def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from each of the three data delivery methods and
    combines them into a single, merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged PHSEN data stream resampled to a 3-hour time record
    """
    # download the telemetered data and re-process it to create a more useful and coherent data set
    tag = '.*PHSEN.*\\.nc$'
    telem = load_gc_thredds(site, node, sensor, 'telemetered', 'phsen_abcdef_dcl_instrument', tag)
    telem = phsen_datalogger(telem)

    # download the recovered host data and re-process it to create a more useful and coherent data set
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', 'phsen_abcdef_dcl_instrument_recovered', tag)
    rhost = phsen_datalogger(rhost)

    # download the recovered instrument data and re-process it to create a more useful and coherent data set
    rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', 'phsen_abcdef_instrument', tag)
    rinst = phsen_instrument(rinst)

    # combine the three datasets into a single, merged time series resampled to a 3-hour interval
    merged = combine_datasets(telem, rhost, rinst, 180)

    # re-run the quality checks, since averaging will change the flag values
    merged['seawater_ph_quality_flag'] = quality_checks(merged)

    return merged
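# The sketch below shows one way to call the merge function above and save the result;
# it is a minimal example, and the reference designator parts and output filename are
# illustrative placeholders, not values taken from this module.
def example_phsen_merge():
    """Minimal usage sketch, assuming a valid PHSEN reference designator."""
    # site, node and sensor below are hypothetical placeholders for a PHSEN deployment
    merged = combine_delivery_methods('CE01ISSM', 'RID16', '06-PHSEND101')
    # save the merged, 3-hour resampled data set to a local NetCDF file (arbitrary name)
    merged.to_netcdf('phsen_merged_3hr.nc')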
def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from each of the three data delivery methods for
    the uncabled CTD (CTDBP) and combines them into a single, merged xarray
    data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged CTDBP data stream resampled to a 3-hour time record
    """
    # download the telemetered data and re-process it to create a more useful and coherent data set
    tag = '.*CTDBP.*\\.nc$'
    telem = load_gc_thredds(site, node, sensor, 'telemetered', 'ctdbp_cdef_dcl_instrument', tag)
    telem = ctdbp_datalogger(telem)

    # download the recovered host data and re-process it to create a more useful and coherent data set
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', 'ctdbp_cdef_dcl_instrument_recovered', tag)
    rhost = ctdbp_datalogger(rhost)

    # download the recovered instrument data and re-process it to create a more useful and coherent data set
    rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', 'ctdbp_cdef_instrument_recovered', tag)
    rinst = ctdbp_instrument(rinst)

    # combine the three datasets into a single, merged time series resampled to a 3-hour interval
    merged = combine_datasets(telem, rhost, rinst, 180)
    return merged
def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from each of the three data delivery methods for
    the seafloor pressure sensor (PRESF) and combines them into a single,
    merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged PRESF data stream resampled to an hourly time record
    """
    # download the telemetered, recovered_host and recovered_inst data and re-process it to create
    # a more useful and coherent data set
    tag = '.*PRESF.*\\.nc$'
    telem = load_gc_thredds(site, node, sensor, 'telemetered', 'presf_abc_dcl_tide_measurement', tag)
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', 'presf_abc_dcl_tide_measurement_recovered', tag)
    rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', 'presf_abc_tide_measurement_recovered', tag)

    # combine the three datasets into a single, merged time series resampled to an hourly time series
    merged = combine_datasets(telem, rhost, rinst, 60)
    return merged
def combine_delivery_methods(site, node, sensor, source):
    """
    Takes the downloaded data from each of the two data delivery methods for
    the atmospheric pCO2 data stream (which also contains the surface seawater
    pCO2) and combines them into a single, merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :param source: specify whether this is the air or water stream
    :return merged: the atmospheric or surface seawater pCO2 data stream
        resampled to a 3-hour time record
    """
    # set the file tag and stream names based on the requested source (air or water)
    if source == 'air':
        tag = '.*PCO2A.*air.*\\.nc$'
        tstream = 'pco2a_a_dcl_instrument_air'
        rstream = 'pco2a_a_dcl_instrument_air_recovered'
    else:
        tag = '.*PCO2A.*water.*\\.nc$'
        tstream = 'pco2a_a_dcl_instrument_water'
        rstream = 'pco2a_a_dcl_instrument_water_recovered'

    # download the telemetered data and re-process it to create a more useful and coherent data set
    telem = load_gc_thredds(site, node, sensor, 'telemetered', tstream, tag)
    telem = pco2a_datalogger(telem)

    # download the recovered host data and re-process it to create a more useful and coherent data set
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', rstream, tag)
    rhost = pco2a_datalogger(rhost)

    # combine the two datasets into a single, merged time series resampled to a 3-hour interval
    merged = combine_datasets(telem, rhost, None, 180)
    return merged
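# A minimal usage sketch for the air/water split above; the reference designator parts
# and output filenames are hypothetical placeholders, and writing each source to its own
# NetCDF file is just one possible way to store the results.
def example_pco2a_merge():
    """Download and merge both PCO2A sources for a hypothetical surface mooring."""
    air = combine_delivery_methods('CE02SHSM', 'SBD12', '04-PCO2AA000', 'air')
    water = combine_delivery_methods('CE02SHSM', 'SBD12', '04-PCO2AA000', 'water')
    air.to_netcdf('pco2a_air_3hr.nc')
    water.to_netcdf('pco2a_water_3hr.nc')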
def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from the different data delivery methods for the
    three-channel fluorometer (FLORT), and combines them, where appropriate,
    into a single, merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged and resampled (if appropriate) FLORT dataset
    """
    # set the stream and tag constants
    tag = '.*FLORT.*\\.nc$'
    stream = 'flort_sample'

    if node in ['SP001', 'WFP01']:
        # this FLORT is part of a CSPP or WFP and includes telemetered and recovered data
        if node == 'SP001':
            telem = None  # don't use the telemetered CSPP data
            print('##### Downloading the recovered_cspp FLORT data for %s #####' % site)
            rhost = load_gc_thredds(site, node, sensor, 'recovered_cspp', stream, tag)
            deployments = []
            print('# -- Group the data by deployment and process the data')
            grps = list(rhost.groupby('deployment'))
            for grp in grps:
                print('# -- Processing recovered_cspp deployment %s' % grp[0])
                deployments.append(flort_cspp(grp[1]))
            deployments = [i for i in deployments if i]
            rhost = xr.concat(deployments, 'time')
        else:
            print('##### Downloading the telemetered FLORT data for %s #####' % site)
            telem = load_gc_thredds(site, node, sensor, 'telemetered', stream, tag)
            deployments = []
            print('# -- Group the data by deployment and process the data')
            grps = list(telem.groupby('deployment'))
            for grp in grps:
                print('# -- Processing telemetered deployment %s' % grp[0])
                deployments.append(flort_wfp(grp[1]))
            deployments = [i for i in deployments if i]
            telem = xr.concat(deployments, 'time')

            print('##### Downloading the recovered_wfp FLORT data for %s #####' % site)
            rhost = load_gc_thredds(site, node, sensor, 'recovered_wfp', stream, tag)
            deployments = []
            print('# -- Group the data by deployment and process the data')
            grps = list(rhost.groupby('deployment'))
            for grp in grps:
                print('# -- Processing recovered_wfp deployment %s' % grp[0])
                deployments.append(flort_wfp(grp[1]))
            deployments = [i for i in deployments if i]
            rhost = xr.concat(deployments, 'time')

        # merge, but do not resample the time records
        merged = combine_datasets(telem, rhost, None, None)
    elif node == 'SBD17':
        # this FLORT is mounted on the buoy of the Inshore moorings and includes all three types of data
        print('##### Downloading the telemetered FLORT data for %s #####' % site)
        telem = load_gc_thredds(site, node, sensor, 'telemetered', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(telem.groupby('deployment'))
        for grp in grps:
            print('# -- Processing telemetered deployment %s' % grp[0])
            deployments.append(flort_instrument(grp[1]))
        deployments = [i for i in deployments if i]
        telem = xr.concat(deployments, 'time')

        print('##### Downloading the recovered_host FLORT data for %s #####' % site)
        rhost = load_gc_thredds(site, node, sensor, 'recovered_host', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(rhost.groupby('deployment'))
        for grp in grps:
            print('# -- Processing recovered_host deployment %s' % grp[0])
            deployments.append(flort_instrument(grp[1]))
        deployments = [i for i in deployments if i]
        rhost = xr.concat(deployments, 'time')

        print('##### Downloading the recovered_inst FLORT data for %s #####' % site)
        rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(rinst.groupby('deployment'))
        for grp in grps:
            print('# -- Processing recovered_inst deployment %s' % grp[0])
            deployments.append(flort_instrument(grp[1]))
        deployments = [i for i in deployments if i]
        rinst = xr.concat(deployments, 'time')

        # merge and resample to a 2-hour data record
        merged = combine_datasets(telem, rhost, rinst, 120)
    else:
        # this FLORT is standalone on one of the NSIFs and includes the telemetered and recovered_host data.
        # data is collected in bursts (3 minutes at 1 Hz); process each data set per-deployment
        print('##### Downloading the telemetered FLORT data for %s #####' % site)
        telem = load_gc_thredds(site, node, sensor, 'telemetered', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(telem.groupby('deployment'))
        for grp in grps:
            print('# -- Processing telemetered deployment %s' % grp[0])
            deployments.append(flort_datalogger(grp[1], True))
        deployments = [i for i in deployments if i]
        telem = xr.concat(deployments, 'time')

        print('##### Downloading the recovered_host FLORT data for %s #####' % site)
        rhost = load_gc_thredds(site, node, sensor, 'recovered_host', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(rhost.groupby('deployment'))
        for grp in grps:
            print('# -- Processing recovered_host deployment %s' % grp[0])
            deployments.append(flort_datalogger(grp[1], True))
        deployments = [i for i in deployments if i]
        rhost = xr.concat(deployments, 'time')

        # combine the datasets, leaving them as 15-minute median averaged datasets
        merged = combine_datasets(telem, rhost, None, None)

    return merged
def generate_qartod(site, node, sensor, cut_off):
    """
    Load all of the pCO2 data for a defined reference designator (using the
    site, node and sensor names to construct the reference designator)
    collected via the recovered instrument method and combine them into a
    single data set from which QARTOD test limits for the gross range and
    climatology tests can be calculated.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :param cut_off: string formatted date to use as cut-off for data to add
        to QARTOD test sets
    :return annotations: Initial list of auto-generated HITL annotations as a
        pandas dataframe
    :return gr_lookup: CSV formatted strings to save to a csv file for the
        QARTOD gross range lookup tables.
    :return clm_lookup: CSV formatted strings to save to a csv file for the
        QARTOD climatology lookup tables.
    :return clm_table: CSV formatted strings to save to a csv file for the
        QARTOD climatology range tables.
    """
    # load the recovered instrument data
    data = load_gc_thredds(site, node, sensor, 'recovered_inst', 'pco2w_abc_instrument',
                           '^(?!.*blank).*PCO2W.*nc$')
    data = pco2w_instrument(data)

    # resample the data into a 3 hour, median averaged time series
    data = combine_datasets(data, None, None, 180)

    # recalculate the quality flags as averaging will alter them
    data['pco2_seawater_quality_flag'] = quality_checks(data)

    # create a boolean array of the data marked as "fail" by the pCO2 quality checks and generate initial
    # HITL annotations that can be combined with system annotations and pCO2 quality checks to create
    # a cleaned up data set prior to calculating the QARTOD test values
    fail = data.pco2_seawater_quality_flag.where(data.pco2_seawater_quality_flag == 4).notnull()
    blocks = identify_blocks(fail, [24, 96])
    hitl = create_annotations(site, node, sensor, blocks)

    # get the current system annotations for the sensor
    annotations = get_annotations(site, node, sensor)
    annotations = pd.DataFrame(annotations)
    if not annotations.empty:
        annotations = annotations.drop(columns=['@class'])
        annotations['beginDate'] = pd.to_datetime(annotations.beginDT, unit='ms').dt.strftime('%Y-%m-%dT%H:%M:%S')
        annotations['endDate'] = pd.to_datetime(annotations.endDT, unit='ms').dt.strftime('%Y-%m-%dT%H:%M:%S')

    # append the fail annotations to the existing annotations
    annotations = annotations.append(pd.DataFrame(hitl), ignore_index=True, sort=False)

    # create a roll-up annotation flag
    data = add_annotation_qc_flags(data, annotations)

    # clean-up the data, removing values that fail the pCO2 quality checks or were marked as fail in the annotations
    data = data.where((data.pco2_seawater_quality_flag != 4) & (data.rollup_annotations_qc_results != 4))

    # if a cut_off date was used, limit data to all data collected up to the cut_off date.
    # otherwise, set the limit to the range of the downloaded data.
    if cut_off:
        cut = parser.parse(cut_off)
        cut = cut.astimezone(pytz.utc)
        end_date = cut.strftime('%Y-%m-%dT%H:%M:%S')
        src_date = cut.strftime('%Y-%m-%d')
    else:
        cut = parser.parse(data.time_coverage_end)
        cut = cut.astimezone(pytz.utc)
        end_date = cut.strftime('%Y-%m-%dT%H:%M:%S')
        src_date = cut.strftime('%Y-%m-%d')

    data = data.sel(time=slice('2014-01-01T00:00:00', end_date))

    # create the initial gross range entry
    gr = process_gross_range(data, ['pco2_seawater'], [200, 2000], site=site, node=node, sensor=sensor)

    # re-work the gross range entry for the different streams and parameter names
    gr_lookup = pd.DataFrame()
    gr_lookup = gr_lookup.append([gr, gr, gr], ignore_index=True)
    gr_lookup['parameter'][0] = {'inp': 'pco2_seawater'}
    gr_lookup['stream'][0] = 'pco2w_abc_dcl_instrument'
    gr_lookup['parameter'][1] = {'inp': 'pco2_seawater'}
    gr_lookup['stream'][1] = 'pco2w_abc_dcl_instrument_recovered'
    gr_lookup['parameter'][2] = {'inp': 'pco2_seawater'}
    gr_lookup['stream'][2] = 'pco2w_abc_instrument'
    gr_lookup['source'] = ('Sensor min/max based on the vendor standard calibration range. '
                           'The user min/max is the historical mean of all data collected '
                           'up to {} +/- 3 standard deviations.'.format(src_date))

    # create and format the climatology entry and table
    cll, clm_table = process_climatology(data, ['pco2_seawater'], [200, 2000], site=site, node=node, sensor=sensor)

    # re-work the climatology entry for the different streams and parameter names
    clm_lookup = pd.DataFrame()
    clm_lookup = clm_lookup.append([cll, cll, cll])
    clm_lookup['parameters'][0] = {'inp': 'pco2_seawater', 'tinp': 'time', 'zinp': 'None'}
    clm_lookup['stream'][0] = 'pco2w_abc_dcl_instrument'
    clm_lookup['parameters'][1] = {'inp': 'pco2_seawater', 'tinp': 'time', 'zinp': 'None'}
    clm_lookup['stream'][1] = 'pco2w_abc_dcl_instrument_recovered'
    clm_lookup['parameters'][2] = {'inp': 'pco2_seawater', 'tinp': 'time', 'zinp': 'None'}
    clm_lookup['stream'][2] = 'pco2w_abc_instrument'

    return annotations, gr_lookup, clm_lookup, clm_table
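# A minimal sketch of how the outputs of generate_qartod might be saved for review;
# the reference designator parts, cut-off date and output filenames are hypothetical
# placeholders, not values taken from this module.
def example_pco2w_qartod():
    """Generate and save QARTOD lookup tables for a hypothetical PCO2W deployment."""
    annotations, gr_lookup, clm_lookup, clm_table = generate_qartod('CE01ISSM', 'MFD35', '05-PCO2WB000', '2021-01-01')
    annotations.to_csv('pco2w_annotations.csv', index=False)
    gr_lookup.to_csv('pco2w_gross_range.csv', index=False)
    clm_lookup.to_csv('pco2w_climatology.csv', index=False)
    # clm_table holds the formatted climatology range tables and can be written out separately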