def create_or_open_qc_file(qc_file_name, parent_dimensions):
    """
    Given a name of a QC file, attempt to open it. If it exists, check that
    it has the same dimensions as "dimensions", primarily taken from the raw
    parent NetCDF dataset, and then return it opened in append mode. If it
    does not exist (or opening fails, or dimensions mismatch), create a new
    NetCDF file with the specified dimensions.

    Returns an opened netCDF Dataset object.

    :param qc_file_name str: The path of the file to open or create
    :param parent_dimensions: Mapping of dimension name -> netCDF Dimension
                              objects from the parent dataset
    """
    if os.path.exists(qc_file_name):
        try:
            ncfile = Dataset(qc_file_name, 'a')
            if (extract_dimensions(ncfile.dimensions) !=
                    extract_dimensions(parent_dimensions)):
                # close the append-mode handle before we fall through and
                # re-open the same path in 'w' mode below, otherwise the
                # stale handle is leaked
                ncfile.close()
                raise ValueError(
                    "File {} had dimensional mismatch with original data dimensions, recreating QC file"
                    .format(qc_file_name))
            else:
                return ncfile
        # if an exception is raised, log it and fall through to write the
        # QC file from scratch.  Catch Exception rather than using a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        except Exception:
            get_logger().exception("Error during handling of QC file")
    # if we reached here, either the qc file didn't exist or ran into an error
    # while trying to be opened, so attempt to create the file from scratch
    ncfile = Dataset(qc_file_name, 'w')
    # mirror the parent's dimensions onto the fresh QC file
    for d in six.itervalues(parent_dimensions):
        ncfile.createDimension(d.name, d.size)
    return ncfile
def check_if_qc_vars_exist(file_path, qc_varnames, qc_varnames_bkp):
    """
    Checks that QC variables exist in the corresponding QC (.ncq) file based
    on the data file's filename.

    :param file_path str: path to the data (.nc) file
    :param qc_varnames set: expected QC variable names
    :param qc_varnames_bkp set: alternate (backup) set of QC variable names
    :returns: True if all QC variables from either set are present in the QC
              file; False if the QC file is missing, unreadable, or does not
              contain all the QC variables
    """
    qc_filepath = file_path.rsplit('.', 1)[0] + '.ncq'
    # try to fetch the QC file's variable names. If it does not
    # exist, no QC has been applied and it must be created later
    if not os.path.exists(qc_filepath):
        return False
    try:
        with Dataset(qc_filepath) as f:
            qc_vars = f.variables.keys()
            # check if all the QC variables exist in the file.
            # if they don't, add them to the list of files to be processed
            return (qc_varnames.issubset(qc_vars) or
                    qc_varnames_bkp.issubset(qc_vars))
    # if for some reason we can't open the file, note the exception and
    # treat the qc variables as missing.  Catch Exception rather than a
    # bare except so KeyboardInterrupt/SystemExit still propagate.
    except Exception:
        get_logger().exception(
            'Failed to open file {}'.format(qc_filepath))
        return False
def get_unmasked(self, ncvariable):
    """
    Return ``(times, values, mask)`` for a variable.

    ``mask`` marks entries masked in either the variable or the time
    coordinate; ``values`` contains the unmasked data, converted to the
    configured units when they differ from the variable's units.

    :param ncvariable: netCDF variable to extract data from
    """
    times = self.ncfile.variables['time'][:]
    values = ncvariable[:]
    # an entry is dropped if either its value or its timestamp is masked
    mask = np.zeros(times.shape[0], dtype=bool)
    if hasattr(values, 'mask'):
        mask |= values.mask
    if hasattr(times, 'mask'):
        mask |= times.mask
    values_initial = ma.getdata(values[~mask])
    config = self.get_config(ncvariable.name)
    units = getattr(ncvariable, 'units', '1')
    # If units are not defined or empty, set them to unitless
    # If the config units are empty, do not attempt to convert units
    # The latter is necessary as some of the NetCDF files do not have
    # units attribute under the udunits variable definitions
    if not units or pd.isnull(config.units):
        units = '1'
        values = values_initial
    # compare the normalized `units` (not ncvariable.units, which raises
    # AttributeError when the attribute is absent).
    # must be a CF unit or this will throw an exception
    elif units != config.units:
        try:
            values = Unit(units).convert(values_initial, config.units)
        except ValueError as e:
            exc_text = "Caught exception while converting units: {}".format(
                e)
            get_logger().warn(exc_text)
            values = values_initial
    else:
        # units already match the config: still return only the unmasked
        # values so callers indexing with ~mask see a consistent length
        values = values_initial
    return times, values, mask
def load_config(self, path):
    '''
    Read the Excel QC configuration at ``path`` into a DataFrame, store it
    on the instance as ``self.config``, and return it.
    '''
    get_logger().info("Loading config %s", path)
    self.config = pd.read_excel(path)
    return self.config
def apply_qc(self, ncvariable):
    '''
    Applies QC to a qartod variable

    Dispatches on the variable's ``qartod_test`` attribute to the matching
    QARTOD test, builds the test parameters from the station configuration,
    runs the test on the unmasked parent data, and writes the resulting
    flags back into ``ncvariable`` at the unmasked positions.

    :param netCDF4.Variable ncvariable: A QARTOD Variable
    '''
    # dispatch table: qartod_test attribute value -> QC test callable
    qc_tests = {
        'flat_line': qc.flat_line_check,
        'gross_range': qc.range_check,
        'rate_of_change': qc.rate_of_change_check,
        'spike': qc.spike_check,
        'pressure': gliders_qc.pressure_check
    }

    # If the qartod_test attribute isn't defined then this isn't a variable
    # this script created and is not eligble for automatic QC
    qartod_test = getattr(ncvariable, 'qartod_test', None)
    if not qartod_test:
        return

    # Get a reference to the parent variable using the standard_name attribute
    # (only the first whitespace-separated token is used for the lookup)
    standard_name = getattr(ncvariable, 'standard_name').split(' ')[0]
    parent = self.ncfile.get_variables_by_attributes(
        standard_name=standard_name)[0]
    test_params = self.get_test_params(parent.name)

    # If there is no parameters defined for this test, don't apply QC
    if qartod_test not in test_params:
        return

    test_params = test_params[qartod_test]
    if 'thresh_val' in test_params:
        # NOTE(review): thresh_val appears to be configured as a per-hour
        # rate and is converted here via pq.hour -- confirm the expected
        # units against the QC library
        test_params['thresh_val'] = test_params['thresh_val'] / pq.hour

    times, values, mask = self.get_unmasked(parent)
    if qartod_test == 'rate_of_change':
        # rate_of_change needs actual datetimes for the unmasked samples
        times = ma.getdata(times[~mask])
        dates = np.array(num2date(times, self.ncfile.variables['time'].units),
                         dtype='datetime64[ms]')
        test_params['times'] = dates

    # the pressure check takes its data under 'pressure'; every other test
    # takes it under 'arr'
    if qartod_test == 'pressure':
        test_params['pressure'] = values
    else:
        test_params['arr'] = values

    qc_flags = qc_tests[qartod_test](**test_params)
    get_logger().info("Flagged: %s", len(np.where(qc_flags == 4)[0]))
    get_logger().info("Total Values: %s", len(values))
    # write flags back only at unmasked positions
    ncvariable[~mask] = qc_flags
def find_geophysical_variables(self):
    '''
    Returns a list of variables that match any variables listed in the
    config file for this station
    '''
    station_id = self.find_station_name().split(':')[-1]
    get_logger().info("Station ID: %s", station_id)
    local_config = self.config[self.config['station_id'] == station_id]
    configured_variables = local_config.variable.tolist()
    get_logger().info("Configured variables: %s",
                      ', '.join(configured_variables))
    # keep dataset iteration order, restricted to configured names
    return [varname for varname in self.ncfile.variables
            if varname in configured_variables]
def find_files(dest_dir, qc_varnames, qc_varnames_bkp):
    """
    Recursively find ``.nc`` files under ``dest_dir`` that do not yet have
    all of their QC variables applied.

    :param dest_dir str: directory tree to search
    :param qc_varnames set: expected QC variable names
    :param qc_varnames_bkp set: alternate (backup) QC variable names
    :returns: list of paths to .nc files still needing QC
    """
    nc_files = []
    if os.path.exists(dest_dir):
        # BUG FIX: os.walk yields (dirpath, dirnames, filenames) where
        # filenames is a *list*; the original code bound `fname` to that
        # list and called .endswith on it, raising AttributeError on any
        # non-empty directory.  Iterate the filenames explicitly.
        for root, subdirs, fnames in os.walk(dest_dir):
            # find all .nc files in this directory, check if there has been
            # qc applied to them, and create absolute paths to them
            for fname in fnames:
                if not fname.endswith('.nc'):
                    continue
                full_path = os.path.join(root, fname)
                if not check_if_qc_vars_exist(full_path, qc_varnames,
                                              qc_varnames_bkp):
                    nc_files.append(full_path)
    else:
        get_logger().warn(
            "Directory '{}' does not exist but was referenced in config".
            format(dest_dir))
    return nc_files
def find_geophysical_variables(self):
    '''
    Returns the set of dataset variables that match any variables listed in
    the config file for this station.  Config rows whose station_id is '*'
    apply to every station unless the same variable is already configured
    for this specific station.
    '''
    variables = []
    station_name = self.find_station_name()
    station_id = station_name.split(':')[-1]
    get_logger().info("Station ID: %s", station_id)
    # rows configured specifically for this station
    local_config = self.config[self.config['station_id'].astype(str) ==
                               station_id]
    # get remaining "all" config rows not overridden by a local row.
    # .copy() so the .loc assignment below writes to an independent frame
    # instead of a view of self.config (avoids pandas'
    # SettingWithCopyWarning and a potentially lost assignment).
    univ_config = self.config[(self.config['station_id'] == '*') & (
        ~self.config['variable'].isin(local_config['variable']))].copy()
    # switch over to normal station ID
    univ_config.loc[:, 'station_id'] = station_id
    config_all = pd.concat([local_config, univ_config])
    configured_variables = config_all.variable.tolist()
    get_logger().info("Configured variables: %s",
                      ', '.join(configured_variables))
    return set(configured_variables).intersection(self.ncfile.variables)
def main():
    '''
    Apply QARTOD QC to GliderDAC submitted netCDF files
    '''
    argparser = ArgumentParser(description=main.__doc__)
    argparser.add_argument('-c', '--config',
                           help='Path to config YML file to use')
    argparser.add_argument('-v', '--verbose', action='store_true',
                           help='Turn on logging')
    argparser.add_argument('netcdf_files', nargs='+',
                           help='NetCDF file to apply QC to')
    cli_args = argparser.parse_args()

    if cli_args.verbose:
        setup_logging()
    get_logger().info("Loading config %s", cli_args.config)
    qc_config = pd.read_excel(cli_args.config)
    # open each submitted file read-only and run QC over it
    for path in cli_args.netcdf_files:
        with Dataset(path, 'r') as dataset:
            run_qc(qc_config, dataset)
def run_qc(config, ncfile):
    '''
    Runs QC on a netCDF file
    '''
    # local renamed from `qc` so it does not shadow the qc test module
    dataset_qc = DatasetQC(ncfile, config)
    for varname in dataset_qc.find_geophysical_variables():
        get_logger().info("Applying QC to %s", varname)
        ncvar = ncfile.variables[varname]
        for qc_varname in dataset_qc.create_qc_variables(ncvar):
            get_logger().info(qc_varname)
            dataset_qc.apply_qc(ncfile.variables[qc_varname])
        get_logger().info("Primary QC")
        dataset_qc.apply_primary_qc(ncvar)
def run_qc(config, ncfile, qc_extension='ncq'):
    '''
    Runs QC on a netCDF file

    Takes a path to an Excel file to be read or pandas DataFrame containing
    QC configuration and applies the QC to the file at `filepath`. Creates a
    file with the same base name as filepath, but with the file extension
    specified by `qc_extension`.

    :param config: str or pandas.DataFrame
    :param ncfile: netCDF4.Dataset
    :param qc_extension: str
    '''
    fname_base = ncfile.filepath().rsplit('.', 1)[0]
    qc_filename = "{}.{}".format(fname_base, qc_extension)
    # Look for existing qc_file or return new one
    qc_file = create_or_open_qc_file(qc_filename, ncfile.dimensions)
    # load NcML aggregation if it exists
    ncml_filename = fname_base + '.ncml'
    qc = DatasetQC(ncfile, qc_file, ncml_filename, config)
    try:
        # zero length times will throw an IndexError in the netCDF
        # interface, and won't result in any QC being applied anyways, so
        # skip them if present
        if qc.ncfile.variables['time'].size > 0:
            for varname in qc.find_geophysical_variables():
                get_logger().info("Applying QC to %s", varname)
                ncvar = ncfile.variables[varname]
                for qcvarname in qc.create_qc_variables(ncvar):
                    qcvar = qc_file.variables[qcvarname]
                    get_logger().info(qcvarname)
                    qc.apply_qc(qcvar)
                get_logger().info("Primary QC")
                qc.apply_primary_qc(ncvar)
        # if there were changes in the ncml file, write them.
        # etree.tostring returns bytes (lxml/Python 3), so write in binary
        # mode rather than text mode.
        if qc.ncml_write_flag:
            with open(ncml_filename, 'wb') as ncml_file:
                ncml_file.write(etree.tostring(qc.ncml))
    finally:
        # always release the QC file handle, even if a QC step raised
        qc_file.close()