Beispiel #1
0
def create_or_open_qc_file(qc_file_name, parent_dimensions):
    """
    Open an existing QC file for appending, or create a fresh one.

    If ``qc_file_name`` exists it is opened in append mode and its
    dimensions are compared (via ``extract_dimensions``) with
    ``parent_dimensions``.  On a match the opened dataset is returned.
    On a mismatch, or on any other error while opening or checking the
    file, the error is logged, the half-opened dataset is closed, and the
    file is recreated from scratch with the parent's dimensions.

    :param qc_file_name str: The path of the file to open or create
    :param parent_dimensions: Mapping whose values expose ``name`` and
                              ``size`` attributes (e.g. the ``dimensions``
                              attribute of the raw parent dataset)
    :return: An opened netCDF Dataset object
    """
    if os.path.exists(qc_file_name):
        ncfile = None
        try:
            ncfile = Dataset(qc_file_name, 'a')
            if (extract_dimensions(ncfile.dimensions) !=
                    extract_dimensions(parent_dimensions)):
                raise ValueError(
                    "File {} had dimensional mismatch with original data dimensions, recreating QC file"
                    .format(qc_file_name))
            return ncfile
        # Deliberately broad (best effort): whatever went wrong, log it and
        # fall through to write the QC file from scratch.  ``Exception``
        # rather than a bare except so Ctrl-C / SystemExit still propagate.
        except Exception:
            get_logger().exception("Error during handling of QC file")
            # Release the append-mode handle before the same path is
            # reopened (and truncated) in write mode below.
            if ncfile is not None:
                try:
                    ncfile.close()
                except Exception:
                    get_logger().exception(
                        "Error closing QC file {}".format(qc_file_name))

    # if we reached here, either the qc file didn't exist or ran into an error
    # while trying to be opened, so attempt to create the file from scratch
    ncfile = Dataset(qc_file_name, 'w')

    # mirror the parent's dimensions in the new file
    for d in six.itervalues(parent_dimensions):
        ncfile.createDimension(d.name, d.size)

    return ncfile
Beispiel #2
0
def check_if_qc_vars_exist(file_path, qc_varnames, qc_varnames_bkp):
    """
    Check whether the QC file that accompanies a data file already holds
    the expected QC variables.

    The QC file path is derived from ``file_path`` by replacing everything
    after the last ``.`` with ``ncq``.

    :param file_path str: Path of the data file
    :param qc_varnames: Iterable of QC variable names that should exist
    :param qc_varnames_bkp: Alternative iterable of names; having all of
                            these present is accepted as well
    :return: True if every name of either collection is a variable in the
             QC file; False if the QC file is missing, cannot be read, or
             lacks some names from both collections
    """
    qc_filepath = file_path.rsplit('.', 1)[0] + '.ncq'

    # If the QC file does not exist, no QC has been applied yet and it
    # must be created later.
    if not os.path.exists(qc_filepath):
        return False

    try:
        with Dataset(qc_filepath) as f:
            # snapshot the names while the dataset is still open
            qc_vars = set(f.variables)
    # if for some reason we can't open the file,
    # note the exception and treat the qc variables as empty
    except Exception:
        get_logger().exception(
            'Failed to open file {}'.format(qc_filepath))
        return False

    # Coerce to sets so that lists/tuples of names work too instead of
    # failing on a missing ``issubset`` method.
    return (set(qc_varnames).issubset(qc_vars)
            or set(qc_varnames_bkp).issubset(qc_vars))
Beispiel #3
0
    def get_unmasked(self, ncvariable):
        """
        Return the time axis, the unmasked values of ``ncvariable`` and the
        combined mask.

        The mask is the union of the variable's own mask and the mask of
        the ``time`` variable.  The returned values contain only the
        entries where the mask is False, converted to the configured units
        when the variable's units differ and a conversion is possible.

        :param ncvariable: Variable to read; must support ``[:]`` and have
                           a ``name`` attribute
        :return: tuple ``(times, values, mask)`` where ``times`` is the
                 full (unfiltered) time array, ``values`` the unmasked
                 data and ``mask`` a boolean array with the length of
                 ``times``
        """
        times = self.ncfile.variables['time'][:]
        raw_values = ncvariable[:]

        mask = np.zeros(times.shape[0], dtype=bool)

        if hasattr(raw_values, 'mask'):
            mask |= raw_values.mask

        if hasattr(times, 'mask'):
            mask |= times.mask

        # Always hand back the unmasked subset, so that its length matches
        # the number of False entries in ``mask`` on every code path.
        values = ma.getdata(raw_values[~mask])
        config = self.get_config(ncvariable.name)
        units = getattr(ncvariable, 'units', '1')
        # If units are not defined or empty, treat them as unitless.
        # If the config units are empty, do not attempt to convert units.
        # The latter is necessary as some of the NetCDF files do not have
        # units attribute under the udunits variable definitions
        if not units or pd.isnull(config.units):
            return times, values, mask

        # Compare the looked-up ``units`` (not ``ncvariable.units``) so a
        # variable without the attribute does not raise AttributeError.
        if units != config.units:
            # must be a CF unit or this will throw an exception
            try:
                values = Unit(units).convert(values, config.units)
            except ValueError as e:
                exc_text = "Caught exception while converting units: {}".format(
                    e)
                get_logger().warning(exc_text)
                # conversion failed: keep the unconverted values
        return times, values, mask
Beispiel #4
0
 def load_config(self, path):
     '''
     Read the Excel config file at ``path`` into a dataframe.

     The dataframe is stored on ``self.config`` and is also returned.
     '''
     get_logger().info("Loading config %s", path)
     self.config = pd.read_excel(path)
     return self.config
Beispiel #5
0
    def apply_qc(self, ncvariable):
        '''
        Applies QC to a qartod variable

        The test to run is read from the variable's ``qartod_test``
        attribute.  The parent (data) variable is looked up through the
        first word of the QC variable's ``standard_name``; the test
        parameters come from ``self.get_test_params`` for that parent.
        The resulting flags are written to the unmasked positions of
        ``ncvariable``.  Nothing is done when the variable has no
        ``qartod_test`` attribute or no parameters are configured for it.

        :param netCDF4.Variable ncvariable: A QARTOD Variable
        '''
        qc_tests = {
            'flat_line': qc.flat_line_check,
            'gross_range': qc.range_check,
            'rate_of_change': qc.rate_of_change_check,
            'spike': qc.spike_check,
            'pressure': gliders_qc.pressure_check
        }

        # If the qartod_test attribute isn't defined then this isn't a variable
        # this script created and is not eligible for automatic QC
        qartod_test = getattr(ncvariable, 'qartod_test', None)
        if not qartod_test:
            return

        # Get a reference to the parent variable using the standard_name attribute
        standard_name = getattr(ncvariable, 'standard_name').split(' ')[0]
        parent = self.ncfile.get_variables_by_attributes(
            standard_name=standard_name)[0]

        configured_params = self.get_test_params(parent.name)
        # If there is no parameters defined for this test, don't apply QC
        if qartod_test not in configured_params:
            return

        # Work on a shallow copy: the entries added or rescaled below must
        # not leak back into whatever get_test_params returned, otherwise a
        # shared/cached dict would have thresh_val divided again on every
        # call.
        test_params = dict(configured_params[qartod_test])

        if 'thresh_val' in test_params:
            test_params['thresh_val'] = test_params['thresh_val'] / pq.hour

        times, values, mask = self.get_unmasked(parent)

        if qartod_test == 'rate_of_change':
            # this test also needs the timestamps of the unmasked samples
            times = ma.getdata(times[~mask])
            dates = np.array(num2date(times,
                                      self.ncfile.variables['time'].units),
                             dtype='datetime64[ms]')
            test_params['times'] = dates

        # the pressure check takes its data under a different keyword
        if qartod_test == 'pressure':
            test_params['pressure'] = values
        else:
            test_params['arr'] = values

        qc_flags = qc_tests[qartod_test](**test_params)
        get_logger().info("Flagged: %s", len(np.where(qc_flags == 4)[0]))
        get_logger().info("Total Values: %s", len(values))
        ncvariable[~mask] = qc_flags
Beispiel #6
0
 def find_geophysical_variables(self):
     '''
     Collect the names of the dataset's variables that are also listed in
     the config rows for this station.

     The station ID is the part of the station name after the last ':'.
     Names are returned as a list, in the iteration order of
     ``self.ncfile.variables``.
     '''
     station_id = self.find_station_name().split(':')[-1]
     get_logger().info("Station ID: %s", station_id)

     station_rows = self.config[self.config['station_id'] == station_id]
     configured_variables = station_rows.variable.tolist()
     get_logger().info("Configured variables: %s",
                       ', '.join(configured_variables))

     return [name for name in self.ncfile.variables
             if name in configured_variables]
Beispiel #7
0
def find_files(dest_dir, qc_varnames, qc_varnames_bkp):
    """
    Recursively collect the ``.nc`` files under ``dest_dir`` that still
    need QC.

    A file needs QC when ``check_if_qc_vars_exist`` returns False for it.

    :param dest_dir str: Directory to search (including subdirectories)
    :param qc_varnames: QC variable names passed on to
                        ``check_if_qc_vars_exist``
    :param qc_varnames_bkp: Alternative QC variable names passed on to
                            ``check_if_qc_vars_exist``
    :return: list of paths (``dest_dir``-based) of files lacking QC; empty
             if the directory does not exist
    """
    nc_files = []
    if not os.path.exists(dest_dir):
        get_logger().warning(
            "Directory '{}' does not exist but was referenced in config".
            format(dest_dir))
        return nc_files

    # os.walk yields (dirpath, dirnames, filenames); the third element is
    # a *list* of names, so each file has to be inspected individually.
    for root, _subdirs, fnames in os.walk(dest_dir):
        for fname in fnames:
            if not fname.endswith('.nc'):
                continue

            full_path = os.path.join(root, fname)
            if not check_if_qc_vars_exist(full_path, qc_varnames,
                                          qc_varnames_bkp):
                nc_files.append(full_path)

    return nc_files
Beispiel #8
0
    def find_geophysical_variables(self):
        '''
        Returns the set of variable names that are both present in the
        dataset and listed in the config file for this station.

        Config rows whose ``station_id`` equals this station's ID take
        precedence; rows with the wildcard station ID ``*`` are added for
        every variable that has no station-specific row.  The station ID
        is the part of the station name after the last ':'.
        '''
        station_name = self.find_station_name()
        station_id = station_name.split(':')[-1]
        get_logger().info("Station ID: %s", station_id)

        # Compare as strings in both filters, so IDs that were parsed as
        # numbers still match the string station ID.
        config_station_ids = self.config['station_id'].astype(str)
        local_config = self.config[config_station_ids == station_id]
        # get remaining "all" config
        is_universal = config_station_ids == '*'
        is_overridden = self.config['variable'].isin(local_config['variable'])
        # switch over to normal station ID; assign() builds a new frame
        # instead of writing into a filtered selection of self.config
        univ_config = self.config[is_universal & ~is_overridden].assign(
            station_id=station_id)
        config_all = pd.concat([local_config, univ_config])

        configured_variables = config_all.variable.tolist()
        get_logger().info("Configured variables: %s",
                          ', '.join(configured_variables))
        return set(configured_variables).intersection(self.ncfile.variables)
Beispiel #9
0
def main():
    '''
    Apply QARTOD QC to GliderDAC submitted netCDF files
    '''
    # NOTE: the docstring above doubles as the --help description, so it is
    # kept short.  Command-line entry point: parses the arguments, loads
    # the config with pandas.read_excel and calls run_qc once per file.
    parser = ArgumentParser(description=main.__doc__)
    # The config is read with pd.read_excel below, so it has to be given
    # and it is an Excel file (not YAML).
    parser.add_argument('-c',
                        '--config',
                        required=True,
                        help='Path to Excel config file to use')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='Turn on logging')
    parser.add_argument('netcdf_files',
                        nargs='+',
                        help='NetCDF file to apply QC to')

    args = parser.parse_args()
    if args.verbose:
        setup_logging()
    get_logger().info("Loading config %s", args.config)
    config = pd.read_excel(args.config)
    for nc_file in args.netcdf_files:
        # the context manager closes each dataset even if run_qc raises
        with Dataset(nc_file, 'r') as nc:
            run_qc(config, nc)
Beispiel #10
0
def run_qc(config, ncfile):
    '''
    Run every configured QC test on a netCDF file.

    For each geophysical variable reported by ``DatasetQC``, the QC
    variables are created and each one has its test applied; afterwards
    the primary QC is applied to the geophysical variable itself.

    :param config: QC configuration handed to ``DatasetQC``
    :param ncfile: dataset handed to ``DatasetQC`` and used to look up
                   variables by name
    '''
    dataset_qc = DatasetQC(ncfile, config)
    log = get_logger
    for data_name in dataset_qc.find_geophysical_variables():
        log().info("Applying QC to %s", data_name)
        data_var = ncfile.variables[data_name]

        for flag_name in dataset_qc.create_qc_variables(data_var):
            flag_var = ncfile.variables[flag_name]
            log().info(flag_name)
            dataset_qc.apply_qc(flag_var)

        log().info("Primary QC")
        dataset_qc.apply_primary_qc(data_var)
Beispiel #11
0
def run_qc(config, ncfile, qc_extension='ncq'):
    '''
    Runs QC on a netCDF file

    The QC results are written to a companion file that has the same base
    name as ``ncfile.filepath()`` but the extension ``qc_extension``; an
    existing companion file is reused when possible (see
    ``create_or_open_qc_file``).  An NcML file with the same base name and
    the extension ``ncml`` is handed to ``DatasetQC`` and rewritten when
    ``DatasetQC`` flags it as changed.  The QC file is always closed
    before returning, even if an error occurs.

    :param config: QC configuration, passed through to ``DatasetQC``
    :param ncfile: netCDF4.Dataset holding the data to be checked
    :param qc_extension: str, file extension of the QC file (no dot)
    '''
    fname_base = ncfile.filepath().rsplit('.', 1)[0]
    qc_filename = "{}.{}".format(fname_base, qc_extension)
    # Look for existing qc_file or return new one
    qc_file = create_or_open_qc_file(qc_filename, ncfile.dimensions)
    try:
        # load NcML aggregation if it exists
        ncml_filename = fname_base + '.ncml'
        qc = DatasetQC(ncfile, qc_file, ncml_filename, config)
        # zero length times will throw an IndexError in the netCDF
        # interface, and won't result in any QC being applied anyways, so
        # skip them if present
        if qc.ncfile.variables['time'].size > 0:
            for varname in qc.find_geophysical_variables():
                get_logger().info("Applying QC to %s", varname)
                ncvar = ncfile.variables[varname]
                for qcvarname in qc.create_qc_variables(ncvar):
                    qcvar = qc_file.variables[qcvarname]
                    get_logger().info(qcvarname)
                    qc.apply_qc(qcvar)
                get_logger().info("Primary QC")
                qc.apply_primary_qc(ncvar)
        # if there were changes in the ncml file, write them
        if qc.ncml_write_flag:
            # etree.tostring() returns bytes by default, so the file has
            # to be opened in binary mode
            with open(ncml_filename, 'wb') as ncml_file:
                ncml_file.write(etree.tostring(qc.ncml))
    finally:
        # do not leak the QC file handle when any step above fails
        qc_file.close()