def check_files(file_list, site_code, parameter_names_accepted, input_dir=''):
    """
    Return a chronologically sorted file_list and a dictionary if the file fails one or more of the tests

    :param file_list: list or file URLs
    :param site_code: code of the mooring site
    :param parameter_names_accepted: list of names of accepted parameters
    :param input_dir: base path where source files are stored
    :return: dictionary with the file name and list of failed tests, list good files chronologically ordered
    """

    file_list_dataframe = pd.DataFrame(columns=["url", "deployment_date"])
    error_dict = {}

    for file in file_list:
        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
            error_list = check_file(nc, site_code, parameter_names_accepted)
            if error_list:
                error_dict.update({file: error_list})
            else:
                # DataFrame.append was removed in pandas 2.0; build a one-row frame and concatenate instead
                new_row = pd.DataFrame([{'url': file,
                                         'deployment_date': parse(nc.time_deployment_start)}])
                file_list_dataframe = pd.concat([file_list_dataframe, new_row], ignore_index=True)

    file_list_dataframe = file_list_dataframe.sort_values(by='deployment_date')
    file_list = file_list_dataframe['url'].to_list()
    if not file_list:
        raise NoInputFilesError("no valid input files to aggregate")

    return file_list, error_dict
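
A minimal usage sketch of check_files; the file names, site code and input directory below are placeholders, not taken from the source:

# Hypothetical call: file names, site code and input_dir are illustrative only.
candidate_files = ['deployment_2018.nc', 'deployment_2019.nc', 'deployment_2020.nc']
good_files, errors = check_files(candidate_files, 'NRSROT', ['TEMP'], input_dir='/data/moorings')
# good_files is ordered by time_deployment_start; errors maps each rejected
# file name to the list of checks it failed.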
Example #2
    def test_am_file(self):
        with xr.open_dataset(AM_FILE) as nc:
            error_list = check_file(nc, 'NRSMAI', 'TEMP')
        self.assertEqual(
            set(error_list), {
                'no NOMINAL_DEPTH', 'no time_deployment_start attribute',
                'no time_deployment_end attribute',
                'unexpected quality_control_conventions: "WOCE quality control procedure"'
            })
Example #3
    def test_bad_temp_file(self):
        with xr.open_dataset(BAD_TZ_FILE) as nc:
            error_list = check_file(nc, 'NRSROT', 'TEMP')
        self.assertEqual(
            set(error_list), {
                'no NOMINAL_DEPTH', 'Wrong file version: Level 0 - Raw Data',
                'no time_deployment_start attribute',
                'no time_deployment_end attribute'
            })
Example #4
    def test_wrong_site_and_var(self):
        with xr.open_dataset(GOOD_TZ_FILE) as nc:
            error_list = check_file(nc, 'NO_SITE', 'OTHER')
        self.assertEqual(
            set(error_list),
            {'Wrong site_code: NRSROT', 'no variables to aggregate'})
Example #5
    def test_variable_list(self):
        with xr.open_dataset(GOOD_TZ_FILE) as nc:
            error_list = check_file(nc, 'NRSROT', ['TEMP', 'PSAL', 'DEPTH'])
        self.assertEqual(error_list, [])
Example #6
    def test_good_temp_file(self):
        with xr.open_dataset(GOOD_TZ_FILE) as nc:
            error_list = check_file(nc, 'NRSROT', 'TEMP')
        self.assertEqual(error_list, [])
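
The test methods above belong in a unittest.TestCase subclass; a minimal scaffold under that assumption (the class name is hypothetical, and AM_FILE, BAD_TZ_FILE and GOOD_TZ_FILE are path constants assumed to be defined elsewhere in the test module) could look like:

import unittest
import xarray as xr

class TestCheckFile(unittest.TestCase):   # hypothetical class name
    def test_good_temp_file(self):
        with xr.open_dataset(GOOD_TZ_FILE) as nc:
            error_list = check_file(nc, 'NRSROT', 'TEMP')
        self.assertEqual(error_list, [])

if __name__ == '__main__':
    unittest.main()
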
def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_dir='./',
                    download_url_prefix=None, opendap_url_prefix=None):
    """
    Aggregate the Variable of Interest (VoI) from all deployments at one site.
    Additional metadata variables are stored to track the origin of the data.

    :param files_to_agg: List of files to aggregate. Each path is interpreted relative to input_dir (if specified).
                         These relative paths are listed in the `source_files` variable in the output file.
    :param var_to_agg: name of the Variable of Interest
    :param site_code: code of the mooring site
    :param input_dir: base path where source files are stored
    :param output_dir: path where the result file will be written
    :param download_url_prefix: URL prefix for file download (to be prepended to paths in files_to_agg)
    :param opendap_url_prefix: URL prefix for OPeNDAP access (to be prepended to paths in files_to_agg)
    :return: path of the resulting file, dictionary of rejected files and the tests they failed
    """

    time_units="days since 1950-01-01 00:00:00 UTC"
    time_calendar="gregorian"
    epoch = np.datetime64("1950-01-01T00:00:00")
    one_day = np.timedelta64(1, 'D')
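    # TIME values will be written as numeric days elapsed since this 1950-01-01 epoch,
    # consistent with the time_units string above (see the conversion in the main loop).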

    bad_files = {}
    rejected_files = []

    # default name for temporary file. It will be renamed at the end
    _, temp_outfile = tempfile.mkstemp(suffix='.nc', dir=output_dir)

    ## check files and get total number of flattened obs
    n_obs_total = 0
    for file in files_to_agg:
        with xr.open_dataset(os.path.join(input_dir, file)) as nc:

            error_list = check_file(nc, site_code, var_to_agg)
            if not error_list:
                nc = in_water(nc)
                n_obs_total += len(nc.TIME)
            else:
                bad_files.update({file: error_list})
                rejected_files.append(file)

    ## remove bad files from the list and sort in chronological order
    for file in bad_files.keys():
        files_to_agg.remove(file)
    if len(files_to_agg) == 0:
        raise NoInputFilesError("no valid input files to aggregate")
    files_to_agg = sort_files(files_to_agg, input_dir=input_dir)

    n_files = len(files_to_agg)

    ## create ncdf file, dimensions and variables
    ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC')
    OBSERVATION = ds.createDimension('OBSERVATION', size=n_obs_total)
    INSTRUMENT = ds.createDimension('INSTRUMENT', size=n_files)
    STRING256 = ds.createDimension("strlen", 256)
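    # Layout note: the product uses an indexed ragged array structure. All observations
    # from every deployment are flattened along OBSERVATION, each deployment occupies one
    # slot along INSTRUMENT, and instrument_index (created below) links the two.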


    obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION',), "fill_value": 99999.0}
    obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION',), "fill_value": 99999.0}
    obs_byte_template = {'datatype': np.byte, 'zlib': True, 'dimensions': ('OBSERVATION',), 'fill_value': 99}
    obs_int_template = {'datatype': np.int16, 'zlib': True, 'dimensions': ('OBSERVATION',)}
    inst_S256_template = {'datatype': 'S1', 'dimensions': ('INSTRUMENT', "strlen")}
    inst_float_template = {'datatype': np.float32, 'dimensions': ('INSTRUMENT',), "fill_value": 99999.0}
    inst_double_template = {'datatype': np.float64, 'dimensions': ('INSTRUMENT',), "fill_value": 99999.0}

    agg_variable = ds.createVariable(varname=var_to_agg, **obs_float_template)
    agg_variable_qc = ds.createVariable(varname=var_to_agg + '_quality_control', **obs_byte_template)
    DEPTH = ds.createVariable(varname='DEPTH', **obs_float_template)
    DEPTHqc = ds.createVariable(varname='DEPTH_quality_control', **obs_byte_template)
    PRES = ds.createVariable(varname='PRES', **obs_float_template)
    PRESqc = ds.createVariable(varname='PRES_quality_control', **obs_byte_template)
    PRES_REL = ds.createVariable(varname='PRES_REL', **obs_float_template)
    PRES_RELqc = ds.createVariable(varname='PRES_REL_quality_control', **obs_byte_template)

    TIME = ds.createVariable(varname='TIME', **obs_double_template)
    instrument_index = ds.createVariable(varname='instrument_index', **obs_int_template)

    source_file = ds.createVariable(varname='source_file', **inst_S256_template)
    instrument_id = ds.createVariable(varname='instrument_id', **inst_S256_template)
    LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template)
    LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template)
    NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template)

    ## main loop
    start = 0
    for index, file in enumerate(files_to_agg):
        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
            nc = in_water(nc)
            n_obs = len(nc.TIME)
            end = start + n_obs
            agg_variable[start:end], agg_variable_qc[start:end] = get_variable_values(nc, var_to_agg)
            DEPTH[start:end], DEPTHqc[start:end] = get_variable_values(nc, 'DEPTH')
            PRES[start:end], PRESqc[start:end] = get_variable_values(nc, 'PRES')
            PRES_REL[start:end], PRES_RELqc[start:end] = get_variable_values(nc, 'PRES_REL')

            ## set TIME and instrument index
            TIME[start:end] = (nc.TIME.values - epoch) / one_day
            instrument_index[start:end] = np.repeat(index, n_obs)
            ## get and store deployment metadata
            LATITUDE[index] = nc.LATITUDE.values
            LONGITUDE[index] = nc.LONGITUDE.values
            NOMINAL_DEPTH[index] = get_nominal_depth(nc)
            source_file[index] = stringtochar(np.array(file, dtype='S256'))
            instrument_id[index] = stringtochar(np.array(get_instrument_id(nc), dtype='S256'))

        start = end


    ## add attributes
    with open(TEMPLATE_JSON) as json_file:
        attribute_dictionary = json.load(json_file)
    variable_attribute_dictionary = attribute_dictionary['_variables']
    global_attribute_dictionary = attribute_dictionary['_global']

    ## set variable attrs
    for var in list(ds.variables):
        ds[var].setncatts(variable_attribute_dictionary[var])

    if download_url_prefix or opendap_url_prefix:
        ds['source_file'].setncatts(source_file_attributes(download_url_prefix, opendap_url_prefix))

    ## set global attrs
    timeformat = '%Y-%m-%dT%H:%M:%SZ'
    file_timeformat = '%Y%m%d'

    time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(timeformat)
    time_end = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(timeformat)
    time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(file_timeformat)
    time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(file_timeformat)

    add_attribute = {
                    'title':                    ("Long Timeseries Velocity Aggregated product: " + var_to_agg + " at " +
                                                 site_code + " between " + time_start + " and " + time_end),
                    'site_code':                site_code,
                    'time_coverage_start':      time_start,
                    'time_coverage_end':        time_end,
                    'geospatial_vertical_min':  np.min(ds['DEPTH']),
                    'geospatial_vertical_max':  np.max(ds['DEPTH']),
                    'geospatial_lat_min':       np.min(ds['LATITUDE']),
                    'geospatial_lat_max':       np.max(ds['LATITUDE']),
                    'geospatial_lon_min':       np.min(ds['LONGITUDE']),
                    'geospatial_lon_max':       np.max(ds['LONGITUDE']),
                    'date_created':             datetime.utcnow().strftime(timeformat),
                    'history':                  datetime.utcnow().strftime(timeformat) + ': Aggregated file created.',
                    'keywords':                 ', '.join([var_to_agg, 'AGGREGATED']),
                    'rejected_files':           "\n".join(rejected_files),
                    'generating_code_version':  __version__}
    add_attribute.update(get_contributors(files_to_agg=files_to_agg, input_dir=input_dir))

    github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/'
                      '{v}/aodntools/timeseries_products/aggregated_timeseries.py'.format(v=__version__)
                      )
    global_attribute_dictionary['lineage'] += github_comment
    global_attribute_dictionary.update(add_attribute)
    ds.setncatts(dict(sorted(global_attribute_dictionary.items())))

    ds.close()

    ## create the output file name and rename the tmp file
    facility_code = get_facility_code(os.path.join(input_dir, files_to_agg[0]))
    data_code = get_data_code(var_to_agg) + 'Z'
    product_type = 'aggregated-timeseries'
    file_version = 1
    output_name = '_'.join(['IMOS', facility_code, data_code, time_start_filename, site_code, ('FV0'+str(file_version)),
                            (var_to_agg + "-" + product_type),
                            ('END-'+ time_end_filename), 'C-' + datetime.utcnow().strftime(file_timeformat)]) + '.nc'
    ncout_path = os.path.join(output_dir, output_name)
    shutil.move(temp_outfile, ncout_path)

    return ncout_path, bad_files
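
A minimal usage sketch of main_aggregator; the file names, site code, directories and URL prefixes below are hypothetical, not taken from the source:

# Hypothetical call: all names, paths and URLs are illustrative only.
deployment_files = ['deployment_2019.nc', 'deployment_2020.nc']
output_file, rejected = main_aggregator(
    files_to_agg=deployment_files,
    var_to_agg='TEMP',
    site_code='NRSROT',
    input_dir='/data/moorings',
    output_dir='/data/products',
    download_url_prefix='https://example.org/download/',
    opendap_url_prefix='https://example.org/opendap/')
# output_file is the path of the aggregated NetCDF product;
# rejected maps any skipped source file to the checks it failed.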