def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from each of the three data delivery methods and
    combines them into a single, merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged PHSEN data stream resampled to a 3-hour time record
    """
    # download the telemetered data and re-process it to create a more useful and coherent data set
    tag = '.*PHSEN.*\\.nc$'
    telem = load_gc_thredds(site, node, sensor, 'telemetered', 'phsen_abcdef_dcl_instrument', tag)
    telem = phsen_datalogger(telem)

    # download the recovered host data and re-process it to create a more useful and coherent data set
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', 'phsen_abcdef_dcl_instrument_recovered', tag)
    rhost = phsen_datalogger(rhost)

    # download the recovered instrument data and re-process it to create a more useful and coherent data set
    rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', 'phsen_abcdef_instrument', tag)
    rinst = phsen_instrument(rinst)

    # combine the three datasets into a single, merged time series resampled to a 3-hour interval
    merged = combine_datasets(telem, rhost, rinst, 180)

    # re-run the quality checks, since averaging will change the flag values
    merged['seawater_ph_quality_flag'] = quality_checks(merged)

    return merged
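# The sketch below shows one way to call the merge function above and save the result;
# it is a minimal example, and the reference designator parts and output filename are
# illustrative placeholders, not values taken from this module.
def example_phsen_merge():
    """Minimal usage sketch, assuming a valid PHSEN reference designator."""
    # site, node and sensor below are hypothetical placeholders for a PHSEN deployment
    merged = combine_delivery_methods('CE01ISSM', 'RID16', '06-PHSEND101')
    # save the merged, 3-hour resampled data set to a local NetCDF file (arbitrary name)
    merged.to_netcdf('phsen_merged_3hr.nc')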
def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from each of the three data delivery methods for
    the uncabled CTD (CTDBP) and combines them into a single, merged xarray
    data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged CTDBP data stream resampled to a 3-hour time record
    """
    # download the telemetered data and re-process it to create a more useful and coherent data set
    tag = '.*CTDBP.*\\.nc$'
    telem = load_gc_thredds(site, node, sensor, 'telemetered', 'ctdbp_cdef_dcl_instrument', tag)
    telem = ctdbp_datalogger(telem)

    # download the recovered host data and re-process it to create a more useful and coherent data set
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', 'ctdbp_cdef_dcl_instrument_recovered', tag)
    rhost = ctdbp_datalogger(rhost)

    # download the recovered instrument data and re-process it to create a more useful and coherent data set
    rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', 'ctdbp_cdef_instrument_recovered', tag)
    rinst = ctdbp_instrument(rinst)

    # combine the three datasets into a single, merged time series resampled to a 3-hour interval
    merged = combine_datasets(telem, rhost, rinst, 180)
    return merged
def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from each of the three data delivery methods for
    the seafloor pressure sensor (PRESF) and combines them into a single,
    merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged PRESF data stream resampled to an hourly time record
    """
    # download the telemetered, recovered_host and recovered_inst data and re-process it to create
    # a more useful and coherent data set
    tag = '.*PRESF.*\\.nc$'
    telem = load_gc_thredds(site, node, sensor, 'telemetered', 'presf_abc_dcl_tide_measurement', tag)
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', 'presf_abc_dcl_tide_measurement_recovered', tag)
    rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', 'presf_abc_tide_measurement_recovered', tag)

    # combine the three datasets into a single, merged time series resampled to an hourly time series
    merged = combine_datasets(telem, rhost, rinst, 60)
    return merged
def combine_delivery_methods(site, node, sensor, source):
    """
    Takes the downloaded data from each of the two data delivery methods for
    the atmospheric pCO2 data stream (which also contains the surface seawater
    pCO2) and combines them into a single, merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :param source: specify whether this is the air or water stream
    :return merged: the atmospheric or surface seawater pCO2 data stream
        resampled to a 3-hour time record
    """
    # set the file tag and stream names based on the requested source (air or water)
    if source == 'air':
        tag = '.*PCO2A.*air.*\\.nc$'
        tstream = 'pco2a_a_dcl_instrument_air'
        rstream = 'pco2a_a_dcl_instrument_air_recovered'
    else:
        tag = '.*PCO2A.*water.*\\.nc$'
        tstream = 'pco2a_a_dcl_instrument_water'
        rstream = 'pco2a_a_dcl_instrument_water_recovered'

    # download the telemetered data and re-process it to create a more useful and coherent data set
    telem = load_gc_thredds(site, node, sensor, 'telemetered', tstream, tag)
    telem = pco2a_datalogger(telem)

    # download the recovered host data and re-process it to create a more useful and coherent data set
    rhost = load_gc_thredds(site, node, sensor, 'recovered_host', rstream, tag)
    rhost = pco2a_datalogger(rhost)

    # combine the two datasets into a single, merged time series resampled to a 3-hour interval
    merged = combine_datasets(telem, rhost, None, 180)
    return merged
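# A minimal usage sketch for the air/water split above; the reference designator parts
# and output filenames are hypothetical placeholders, and writing each source to its own
# NetCDF file is just one possible way to store the results.
def example_pco2a_merge():
    """Download and merge both PCO2A sources for a hypothetical surface mooring."""
    air = combine_delivery_methods('CE02SHSM', 'SBD12', '04-PCO2AA000', 'air')
    water = combine_delivery_methods('CE02SHSM', 'SBD12', '04-PCO2AA000', 'water')
    air.to_netcdf('pco2a_air_3hr.nc')
    water.to_netcdf('pco2a_water_3hr.nc')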
def combine_delivery_methods(site, node, sensor):
    """
    Takes the downloaded data from the different data delivery methods for the
    three-channel fluorometer (FLORT), and combines them, where appropriate,
    into a single, merged xarray data set.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :return merged: the merged and resampled (if appropriate) FLORT dataset
    """
    # set the stream and tag constants
    tag = '.*FLORT.*\\.nc$'
    stream = 'flort_sample'

    if node in ['SP001', 'WFP01']:
        # this FLORT is part of a CSPP or WFP and includes telemetered and recovered data
        if node == 'SP001':
            telem = None  # don't use the telemetered CSPP data
            print('##### Downloading the recovered_cspp FLORT data for %s #####' % site)
            rhost = load_gc_thredds(site, node, sensor, 'recovered_cspp', stream, tag)
            deployments = []
            print('# -- Group the data by deployment and process the data')
            grps = list(rhost.groupby('deployment'))
            for grp in grps:
                print('# -- Processing recovered_cspp deployment %s' % grp[0])
                deployments.append(flort_cspp(grp[1]))
            deployments = [i for i in deployments if i]
            rhost = xr.concat(deployments, 'time')
        else:
            print('##### Downloading the telemetered FLORT data for %s #####' % site)
            telem = load_gc_thredds(site, node, sensor, 'telemetered', stream, tag)
            deployments = []
            print('# -- Group the data by deployment and process the data')
            grps = list(telem.groupby('deployment'))
            for grp in grps:
                print('# -- Processing telemetered deployment %s' % grp[0])
                deployments.append(flort_wfp(grp[1]))
            deployments = [i for i in deployments if i]
            telem = xr.concat(deployments, 'time')

            print('##### Downloading the recovered_wfp FLORT data for %s #####' % site)
            rhost = load_gc_thredds(site, node, sensor, 'recovered_wfp', stream, tag)
            deployments = []
            print('# -- Group the data by deployment and process the data')
            grps = list(rhost.groupby('deployment'))
            for grp in grps:
                print('# -- Processing recovered_wfp deployment %s' % grp[0])
                deployments.append(flort_wfp(grp[1]))
            deployments = [i for i in deployments if i]
            rhost = xr.concat(deployments, 'time')

        # merge, but do not resample the time records
        merged = combine_datasets(telem, rhost, None, None)
    elif node == 'SBD17':
        # this FLORT is mounted on the buoy of the Inshore moorings and includes all three types of data
        print('##### Downloading the telemetered FLORT data for %s #####' % site)
        telem = load_gc_thredds(site, node, sensor, 'telemetered', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(telem.groupby('deployment'))
        for grp in grps:
            print('# -- Processing telemetered deployment %s' % grp[0])
            deployments.append(flort_instrument(grp[1]))
        deployments = [i for i in deployments if i]
        telem = xr.concat(deployments, 'time')

        print('##### Downloading the recovered_host FLORT data for %s #####' % site)
        rhost = load_gc_thredds(site, node, sensor, 'recovered_host', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(rhost.groupby('deployment'))
        for grp in grps:
            print('# -- Processing recovered_host deployment %s' % grp[0])
            deployments.append(flort_instrument(grp[1]))
        deployments = [i for i in deployments if i]
        rhost = xr.concat(deployments, 'time')

        print('##### Downloading the recovered_inst FLORT data for %s #####' % site)
        rinst = load_gc_thredds(site, node, sensor, 'recovered_inst', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(rinst.groupby('deployment'))
        for grp in grps:
            print('# -- Processing recovered_inst deployment %s' % grp[0])
            deployments.append(flort_instrument(grp[1]))
        deployments = [i for i in deployments if i]
        rinst = xr.concat(deployments, 'time')

        # merge and resample to a 2-hour data record
        merged = combine_datasets(telem, rhost, rinst, 120)
    else:
        # this FLORT is standalone on one of the NSIFs and includes the telemetered and recovered_host data.
        # data is collected in bursts (3 minutes at 1 Hz); process each data set per-deployment
        print('##### Downloading the telemetered FLORT data for %s #####' % site)
        telem = load_gc_thredds(site, node, sensor, 'telemetered', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(telem.groupby('deployment'))
        for grp in grps:
            print('# -- Processing telemetered deployment %s' % grp[0])
            deployments.append(flort_datalogger(grp[1], True))
        deployments = [i for i in deployments if i]
        telem = xr.concat(deployments, 'time')

        print('##### Downloading the recovered_host FLORT data for %s #####' % site)
        rhost = load_gc_thredds(site, node, sensor, 'recovered_host', stream, tag)
        deployments = []
        print('# -- Group the data by deployment and process the data')
        grps = list(rhost.groupby('deployment'))
        for grp in grps:
            print('# -- Processing recovered_host deployment %s' % grp[0])
            deployments.append(flort_datalogger(grp[1], True))
        deployments = [i for i in deployments if i]
        rhost = xr.concat(deployments, 'time')

        # combine the datasets, leaving them as 15-minute median averaged datasets
        merged = combine_datasets(telem, rhost, None, None)

    return merged
def generate_qartod(site, node, sensor, cut_off):
    """
    Load all of the pCO2 data for a defined reference designator (using the
    site, node and sensor names to construct the reference designator)
    collected via the recovered instrument method and combine them into a
    single data set from which QARTOD test limits for the gross range and
    climatology tests can be calculated.

    :param site: Site designator, extracted from the first part of the
        reference designator
    :param node: Node designator, extracted from the second part of the
        reference designator
    :param sensor: Sensor designator, extracted from the third and fourth part
        of the reference designator
    :param cut_off: string formatted date to use as cut-off for data to add
        to QARTOD test sets
    :return annotations: Initial list of auto-generated HITL annotations as a
        pandas dataframe
    :return gr_lookup: CSV formatted strings to save to a csv file for the
        QARTOD gross range lookup tables.
    :return clm_lookup: CSV formatted strings to save to a csv file for the
        QARTOD climatology lookup tables.
    :return clm_table: CSV formatted strings to save to a csv file for the
        QARTOD climatology range tables.
    """
    # load the recovered instrument data
    data = load_gc_thredds(site, node, sensor, 'recovered_inst', 'pco2w_abc_instrument',
                           '^(?!.*blank).*PCO2W.*nc$')
    data = pco2w_instrument(data)

    # resample the data into a 3 hour, median averaged time series
    data = combine_datasets(data, None, None, 180)

    # recalculate the quality flags as averaging will alter them
    data['pco2_seawater_quality_flag'] = quality_checks(data)

    # create a boolean array of the data marked as "fail" by the pCO2 quality checks and generate initial
    # HITL annotations that can be combined with system annotations and pCO2 quality checks to create
    # a cleaned up data set prior to calculating the QARTOD test values
    fail = data.pco2_seawater_quality_flag.where(data.pco2_seawater_quality_flag == 4).notnull()
    blocks = identify_blocks(fail, [24, 96])
    hitl = create_annotations(site, node, sensor, blocks)

    # get the current system annotations for the sensor
    annotations = get_annotations(site, node, sensor)
    annotations = pd.DataFrame(annotations)
    if not annotations.empty:
        annotations = annotations.drop(columns=['@class'])
        annotations['beginDate'] = pd.to_datetime(annotations.beginDT, unit='ms').dt.strftime('%Y-%m-%dT%H:%M:%S')
        annotations['endDate'] = pd.to_datetime(annotations.endDT, unit='ms').dt.strftime('%Y-%m-%dT%H:%M:%S')

    # append the fail annotations to the existing annotations
    annotations = annotations.append(pd.DataFrame(hitl), ignore_index=True, sort=False)

    # create a roll-up annotation flag
    data = add_annotation_qc_flags(data, annotations)

    # clean-up the data, removing values that fail the pCO2 quality checks or were marked as fail in the annotations
    data = data.where((data.pco2_seawater_quality_flag != 4) & (data.rollup_annotations_qc_results != 4))

    # if a cut_off date was used, limit data to all data collected up to the cut_off date.
    # otherwise, set the limit to the range of the downloaded data.
    if cut_off:
        cut = parser.parse(cut_off)
        cut = cut.astimezone(pytz.utc)
        end_date = cut.strftime('%Y-%m-%dT%H:%M:%S')
        src_date = cut.strftime('%Y-%m-%d')
    else:
        cut = parser.parse(data.time_coverage_end)
        cut = cut.astimezone(pytz.utc)
        end_date = cut.strftime('%Y-%m-%dT%H:%M:%S')
        src_date = cut.strftime('%Y-%m-%d')

    data = data.sel(time=slice('2014-01-01T00:00:00', end_date))

    # create the initial gross range entry
    gr = process_gross_range(data, ['pco2_seawater'], [200, 2000], site=site, node=node, sensor=sensor)

    # re-work the gross range entry for the different streams and parameter names
    gr_lookup = pd.DataFrame()
    gr_lookup = gr_lookup.append([gr, gr, gr], ignore_index=True)
    gr_lookup['parameter'][0] = {'inp': 'pco2_seawater'}
    gr_lookup['stream'][0] = 'pco2w_abc_dcl_instrument'
    gr_lookup['parameter'][1] = {'inp': 'pco2_seawater'}
    gr_lookup['stream'][1] = 'pco2w_abc_dcl_instrument_recovered'
    gr_lookup['parameter'][2] = {'inp': 'pco2_seawater'}
    gr_lookup['stream'][2] = 'pco2w_abc_instrument'
    gr_lookup['source'] = ('Sensor min/max based on the vendor standard calibration range. '
                           'The user min/max is the historical mean of all data collected '
                           'up to {} +/- 3 standard deviations.'.format(src_date))

    # create and format the climatology entry and table
    cll, clm_table = process_climatology(data, ['pco2_seawater'], [200, 2000], site=site, node=node, sensor=sensor)

    # re-work the climatology entry for the different streams and parameter names
    clm_lookup = pd.DataFrame()
    clm_lookup = clm_lookup.append([cll, cll, cll])
    clm_lookup['parameters'][0] = {'inp': 'pco2_seawater', 'tinp': 'time', 'zinp': 'None'}
    clm_lookup['stream'][0] = 'pco2w_abc_dcl_instrument'
    clm_lookup['parameters'][1] = {'inp': 'pco2_seawater', 'tinp': 'time', 'zinp': 'None'}
    clm_lookup['stream'][1] = 'pco2w_abc_dcl_instrument_recovered'
    clm_lookup['parameters'][2] = {'inp': 'pco2_seawater', 'tinp': 'time', 'zinp': 'None'}
    clm_lookup['stream'][2] = 'pco2w_abc_instrument'

    return annotations, gr_lookup, clm_lookup, clm_table
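# A minimal sketch of how the outputs of generate_qartod might be saved for review;
# the reference designator parts, cut-off date and output filenames are hypothetical
# placeholders, not values taken from this module.
def example_pco2w_qartod():
    """Generate and save QARTOD lookup tables for a hypothetical PCO2W deployment."""
    annotations, gr_lookup, clm_lookup, clm_table = generate_qartod('CE01ISSM', 'MFD35', '05-PCO2WB000', '2021-01-01')
    annotations.to_csv('pco2w_annotations.csv', index=False)
    gr_lookup.to_csv('pco2w_gross_range.csv', index=False)
    clm_lookup.to_csv('pco2w_climatology.csv', index=False)
    # clm_table holds the formatted climatology range tables and can be written out separately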