def check_files(file_list, site_code, parameter_names_accepted, input_dir=''):
    """
    Check each file in file_list and return the valid files sorted chronologically,
    together with a dictionary of files that failed one or more of the checks.

    :param file_list: list of file URLs
    :param site_code: code of the mooring site
    :param parameter_names_accepted: list of names of accepted parameters
    :param input_dir: base path where source files are stored
    :return: list of good files in chronological order,
             dictionary mapping each rejected file name to its list of failed tests
    """
    file_list_dataframe = pd.DataFrame(columns=["url", "deployment_date"])
    error_dict = {}

    for file in file_list:
        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
            error_list = check_file(nc, site_code, parameter_names_accepted)
            if error_list:
                error_dict.update({file: error_list})
            else:
                # note: DataFrame.append was removed in pandas 2.0; this code assumes pandas < 2
                file_list_dataframe = file_list_dataframe.append({'url': file,
                                                                  'deployment_date': parse(nc.time_deployment_start)},
                                                                 ignore_index=True)

    ## sort the accepted files by deployment start date
    file_list_dataframe = file_list_dataframe.sort_values(by='deployment_date')
    file_list = file_list_dataframe['url'].to_list()
    if file_list == []:
        raise NoInputFilesError("no valid input files to aggregate")

    return file_list, error_dict
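# Usage sketch for check_files (hedged): the import path, directory and file names below are
# illustrative assumptions, not taken from the package itself.
from aodntools.timeseries_products.aggregated_timeseries import check_files  # import path assumed

good_files, failed = check_files(
    file_list=['IMOS_..._FV01_NRSROT-1901_...nc',   # placeholder names, relative to input_dir
               'IMOS_..._FV01_NRSROT-1907_...nc'],
    site_code='NRSROT',
    parameter_names_accepted=['TEMP', 'PSAL', 'DEPTH'],
    input_dir='/data/moorings',                     # hypothetical base directory
)
# good_files: valid files sorted by time_deployment_start
# failed: {file_name: [list of failed checks], ...}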
def test_am_file(self):
    with xr.open_dataset(AM_FILE) as nc:
        error_list = check_file(nc, 'NRSMAI', 'TEMP')
        self.assertEqual(set(error_list),
                         {'no NOMINAL_DEPTH',
                          'no time_deployment_start attribute',
                          'no time_deployment_end attribute',
                          'unexpected quality_control_conventions: "WOCE quality control procedure"'
                          })


def test_bad_temp_file(self):
    with xr.open_dataset(BAD_TZ_FILE) as nc:
        error_list = check_file(nc, 'NRSROT', 'TEMP')
        self.assertEqual(set(error_list),
                         {'no NOMINAL_DEPTH',
                          'Wrong file version: Level 0 - Raw Data',
                          'no time_deployment_start attribute',
                          'no time_deployment_end attribute'
                          })


def test_wrong_site_and_var(self):
    with xr.open_dataset(GOOD_TZ_FILE) as nc:
        error_list = check_file(nc, 'NO_SITE', 'OTHER')
        self.assertEqual(set(error_list),
                         {'Wrong site_code: NRSROT', 'no variables to aggregate'})


def test_variable_list(self):
    with xr.open_dataset(GOOD_TZ_FILE) as nc:
        error_list = check_file(nc, 'NRSROT', ['TEMP', 'PSAL', 'DEPTH'])
        self.assertEqual(error_list, [])


def test_good_temp_file(self):
    with xr.open_dataset(GOOD_TZ_FILE) as nc:
        error_list = check_file(nc, 'NRSROT', 'TEMP')
        self.assertEqual(error_list, [])
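# The five test methods above presumably live inside a unittest.TestCase subclass, with AM_FILE,
# BAD_TZ_FILE and GOOD_TZ_FILE pointing at NetCDF fixtures shipped with the tests. A minimal sketch
# of that assumed scaffolding (class name, fixture file names and the check_file import path are
# assumptions, not taken from the package):
import os
import unittest

import xarray as xr

from aodntools.timeseries_products.common import check_file  # import path assumed

TEST_ROOT = os.path.dirname(__file__)
AM_FILE = os.path.join(TEST_ROOT, 'AM_file.nc')           # hypothetical fixture names
BAD_TZ_FILE = os.path.join(TEST_ROOT, 'bad_TZ_file.nc')
GOOD_TZ_FILE = os.path.join(TEST_ROOT, 'good_TZ_file.nc')


class TestCheckFile(unittest.TestCase):
    # test_am_file, test_bad_temp_file, test_wrong_site_and_var,
    # test_variable_list and test_good_temp_file go here (see above)
    pass


if __name__ == '__main__':
    unittest.main()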
def main_aggregator(files_to_agg, var_to_agg, site_code, input_dir='', output_dir='./',
                    download_url_prefix=None, opendap_url_prefix=None):
    """
    Aggregate the Variable of Interest (VoI) from all deployments at one site.
    Additional metadata variables are stored to track the origin of the data.

    :param files_to_agg: list of files to aggregate. Each path is interpreted relative to input_dir (if specified).
                         These relative paths are listed in the `source_files` variable in the output file.
    :param var_to_agg: Variable of Interest
    :param site_code: site code
    :param input_dir: base path where source files are stored
    :param output_dir: path where the result file will be written
    :param download_url_prefix: URL prefix for file download (to be prepended to paths in files_to_agg)
    :param opendap_url_prefix: URL prefix for OPeNDAP access (to be prepended to paths in files_to_agg)
    :return: path of the resulting file, dictionary of rejected files and the errors found
    """
    time_units = "days since 1950-01-01 00:00:00 UTC"
    time_calendar = "gregorian"
    epoch = np.datetime64("1950-01-01T00:00:00")
    one_day = np.timedelta64(1, 'D')

    bad_files = {}
    rejected_files = []

    # default name for temporary file. It will be renamed at the end
    _, temp_outfile = tempfile.mkstemp(suffix='.nc', dir=output_dir)

    ## check files and get total number of flattened obs
    n_obs_total = 0
    for file in files_to_agg:
        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
            error_list = check_file(nc, site_code, var_to_agg)
            if not error_list:
                nc = in_water(nc)
                n_obs_total += len(nc.TIME)
            else:
                bad_files.update({file: error_list})
                rejected_files.append(file)

    ## remove bad files from the list and sort in chronological order
    for file in bad_files.keys():
        files_to_agg.remove(file)
    if len(files_to_agg) == 0:
        raise NoInputFilesError("no valid input files to aggregate")
    files_to_agg = sort_files(files_to_agg, input_dir=input_dir)

    n_files = len(files_to_agg)

    ## create netCDF file, dimensions and variables
    ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC')
    OBSERVATION = ds.createDimension('OBSERVATION', size=n_obs_total)
    INSTRUMENT = ds.createDimension('INSTRUMENT', size=n_files)
    STRING256 = ds.createDimension("strlen", 256)

    obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION',), "fill_value": 99999.0}
    obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION',), "fill_value": 99999.0}
    obs_byte_template = {'datatype': np.byte, 'zlib': True, 'dimensions': ('OBSERVATION',), 'fill_value': 99}
    obs_int_template = {'datatype': np.int16, 'zlib': True, 'dimensions': ('OBSERVATION',)}
    inst_S256_template = {'datatype': 'S1', 'dimensions': ('INSTRUMENT', "strlen")}
    inst_float_template = {'datatype': np.float32, 'dimensions': ('INSTRUMENT',), "fill_value": 99999.0}
    inst_double_template = {'datatype': np.float64, 'dimensions': ('INSTRUMENT',), "fill_value": 99999.0}

    agg_variable = ds.createVariable(varname=var_to_agg, **obs_float_template)
    agg_variable_qc = ds.createVariable(varname=var_to_agg + '_quality_control', **obs_byte_template)
    DEPTH = ds.createVariable(varname='DEPTH', **obs_float_template)
    DEPTHqc = ds.createVariable(varname='DEPTH_quality_control', **obs_byte_template)
    PRES = ds.createVariable(varname='PRES', **obs_float_template)
    PRESqc = ds.createVariable(varname='PRES_quality_control', **obs_byte_template)
    PRES_REL = ds.createVariable(varname='PRES_REL', **obs_float_template)
    PRES_RELqc = ds.createVariable(varname='PRES_REL_quality_control', **obs_byte_template)

    TIME = ds.createVariable(varname='TIME', **obs_double_template)
    instrument_index = ds.createVariable(varname='instrument_index', **obs_int_template)

    source_file = ds.createVariable(varname='source_file', **inst_S256_template)
    instrument_id = ds.createVariable(varname='instrument_id', **inst_S256_template)
    LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template)
    LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template)
    NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template)

    ## main loop
    start = 0
    for index, file in enumerate(files_to_agg):
        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
            nc = in_water(nc)
            n_obs = len(nc.TIME)
            end = start + n_obs
            agg_variable[start:end], agg_variable_qc[start:end] = get_variable_values(nc, var_to_agg)
            DEPTH[start:end], DEPTHqc[start:end] = get_variable_values(nc, 'DEPTH')
            PRES[start:end], PRESqc[start:end] = get_variable_values(nc, 'PRES')
            PRES_REL[start:end], PRES_RELqc[start:end] = get_variable_values(nc, 'PRES_REL')

            ## set TIME and instrument index
            TIME[start:end] = (nc.TIME.values - epoch) / one_day
            instrument_index[start:end] = np.repeat(index, n_obs)

            ## get and store deployment metadata
            LATITUDE[index] = nc.LATITUDE.values
            LONGITUDE[index] = nc.LONGITUDE.values
            NOMINAL_DEPTH[index] = get_nominal_depth(nc)
            source_file[index] = stringtochar(np.array(file, dtype='S256'))
            instrument_id[index] = stringtochar(np.array(get_instrument_id(nc), dtype='S256'))
        start = end

    ## add attributes
    with open(TEMPLATE_JSON) as json_file:
        attribute_dictionary = json.load(json_file)
    variable_attribute_dictionary = attribute_dictionary['_variables']
    global_attribute_dictionary = attribute_dictionary['_global']

    ## set variable attrs
    for var in list(ds.variables):
        ds[var].setncatts(variable_attribute_dictionary[var])

    if download_url_prefix or opendap_url_prefix:
        ds['source_file'].setncatts(source_file_attributes(download_url_prefix, opendap_url_prefix))

    ## set global attrs
    timeformat = '%Y-%m-%dT%H:%M:%SZ'
    file_timeformat = '%Y%m%d'
    time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(timeformat)
    time_end = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(timeformat)
    time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(file_timeformat)
    time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(file_timeformat)

    add_attribute = {
        'title': ("Long Timeseries Velocity Aggregated product: " + var_to_agg + " at " + site_code +
                  " between " + time_start + " and " + time_end),
        'site_code': site_code,
        'time_coverage_start': time_start,
        'time_coverage_end': time_end,
        'geospatial_vertical_min': np.min(ds['DEPTH']),
        'geospatial_vertical_max': np.max(ds['DEPTH']),
        'geospatial_lat_min': np.min(ds['LATITUDE']),
        'geospatial_lat_max': np.max(ds['LATITUDE']),
        'geospatial_lon_min': np.min(ds['LONGITUDE']),
        'geospatial_lon_max': np.max(ds['LONGITUDE']),
        'date_created': datetime.utcnow().strftime(timeformat),
        'history': datetime.utcnow().strftime(timeformat) + ': Aggregated file created.',
        'keywords': ', '.join([var_to_agg, 'AGGREGATED']),
        'rejected_files': "\n".join(rejected_files),
        'generating_code_version': __version__
    }
    add_attribute.update(get_contributors(files_to_agg=files_to_agg, input_dir=input_dir))

    github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/'
                      '{v}/aodntools/timeseries_products/aggregated_timeseries.py'.format(v=__version__)
                      )
    global_attribute_dictionary['lineage'] += github_comment

    global_attribute_dictionary.update(add_attribute)
    ds.setncatts(dict(sorted(global_attribute_dictionary.items())))

    ds.close()

    ## create the output file name and rename the tmp file
    facility_code = get_facility_code(os.path.join(input_dir, files_to_agg[0]))
    data_code = get_data_code(var_to_agg) + 'Z'
    product_type = 'aggregated-timeseries'
    file_version = 1
    output_name = '_'.join(['IMOS', facility_code, data_code, time_start_filename, site_code,
                            ('FV0' + str(file_version)), (var_to_agg + "-" + product_type),
                            ('END-' + time_end_filename),
                            'C-' + datetime.utcnow().strftime(file_timeformat)]) + '.nc'
    ncout_path = os.path.join(output_dir, output_name)
    shutil.move(temp_outfile, ncout_path)

    return ncout_path, bad_files
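# End-to-end usage sketch for main_aggregator (hedged): the file names, directories, URL prefixes and
# import path below are illustrative assumptions, not taken from the package.
from aodntools.timeseries_products.aggregated_timeseries import main_aggregator  # import path assumed

output_path, bad_files = main_aggregator(
    files_to_agg=['IMOS_..._FV01_NRSROT-1901_...nc',    # placeholder names, relative to input_dir
                  'IMOS_..._FV01_NRSROT-1907_...nc'],
    var_to_agg='TEMP',
    site_code='NRSROT',
    input_dir='/data/moorings',                         # hypothetical
    output_dir='/data/products',                        # hypothetical
    download_url_prefix='https://example.org/data/',    # hypothetical
    opendap_url_prefix='https://example.org/opendap/',  # hypothetical
)
print(output_path)              # path of the aggregated NetCDF product
for name, errors in bad_files.items():
    print(name, errors)         # rejected source files and the checks they failed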