def test_builder_init(paths, depth, storage_options, include_patterns, exclude_patterns, num_assets):
    builder = Builder(
        paths=paths,
        depth=depth,
        storage_options=storage_options,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )
    builder.get_assets()
    assert isinstance(builder.assets, list)
    assert len(builder.assets) == num_assets

def test_builder_save(root_path, parser):
    builder = Builder(root_path=root_path)
    with TemporaryDirectory() as local_dir:
        catalog_file = f'{local_dir}/my_catalog.csv'
        builder = builder.build(
            path_column='path', variable_column='variable_id', data_format='netcdf'
        ).save(catalog_file)
        path = f'{local_dir}/my_catalog.json'
        col = intake.open_esm_datastore(path)
        pd.testing.assert_frame_equal(col.df, builder.df)
        print(builder.df.shape)

def test_builder_build(root_path, depth, lazy, parser, expected_df_shape):
    b = Builder(
        root_path,
        depth=depth,
        extension='*.nc',
        exclude_patterns=['*/files/*', '*/latest/*'],
        lazy=lazy,
        parser=parser,
    ).build(path_column='path', variable_column='variable_id', data_format='netcdf')

    keys = {'esmcat_version', 'id', 'description', 'attributes', 'assets', 'aggregation_control'}
    assert set(b.esmcol_data.keys()) == keys
    assert b.df.shape == expected_df_shape
    assert isinstance(b.df, pd.DataFrame)
    assert len(b.filelist) == len(b.df)
    intersection = set(cmip6_global_attrs).intersection(set(b.df.columns))
    assert intersection.issubset(set(cmip6_global_attrs))

def test_builder_save(tmp_path):
    builder = Builder(paths=[str(sample_data_dir / 'cesm')], depth=5, include_patterns=['*.nc'])
    builder.get_assets()
    builder.assets.append('cesm/nonexistent_file.nc')  # Add an invalid file
    with pytest.warns(UserWarning):
        builder.parse(parsing_func=parse_cesm_history).clean_dataframe()
    with pytest.warns(UserWarning):
        builder.save(
            name='test',
            path_column_name='path',
            directory=str(tmp_path),
            data_format='netcdf',
            variable_column_name='variables',
            aggregations=[],
            groupby_attrs=[],
        )

    assert not builder.invalid_assets.empty
    cat = intake.open_esm_datastore(str(tmp_path / 'test.json'))
    assert isinstance(cat.df, pd.DataFrame)

def test_builder_update(root_path, parser, num_items, dummy_assets):
    with TemporaryDirectory() as local_dir:
        catalog_file = f'{local_dir}/dummy.csv'
        builder = Builder(
            root_path=root_path, exclude_patterns=['*/files/*', '*/latest/*'], parser=parser
        )
        builder = builder.build(
            path_column='path', variable_column='variable_id', data_format='netcdf'
        )
        builder.save(catalog_file)

        df = pd.read_csv(catalog_file).head(num_items)
        if dummy_assets:
            # append the dummy assets to the saved catalog
            df = pd.concat([df, pd.DataFrame(dummy_assets)], ignore_index=True)
        df.to_csv(catalog_file, index=False)

        builder = builder.update(catalog_file, path_column='path')
        assert len(builder.old_df) == num_items + len(dummy_assets)
        assert (len(builder.df) - len(builder.old_df)) == len(builder.new_df) - len(dummy_assets)

def test_builder_build(paths, depth, storage_options, include_patterns, exclude_patterns, num_assets):
    builder = Builder(
        paths=paths,
        depth=depth,
        storage_options=storage_options,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )
    builder.get_assets()
    assert len(builder.assets) == num_assets

    builder.build(
        parsing_func=parsing_func,
        postprocess_func=post_process_func,
        postprocess_func_kwargs={'times': 100},
    )
    assert isinstance(builder.df, pd.DataFrame)
    assert len(builder.df) == num_assets
    assert set(builder.df.columns) == {'path', 'variable', 'my_column'}

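# The `parsing_func` and `post_process_func` fixtures used above are defined elsewhere in the
# test suite. The functions below are only an illustrative sketch of the shapes the Builder
# expects (the names `example_parsing_func` and `example_post_process` are hypothetical):
# a parsing function maps one asset to a dict of column values, and a post-processing
# function receives the assembled DataFrame plus any `postprocess_func_kwargs`.


def example_parsing_func(asset):
    # one catalog record per asset; 'path' and 'variable' match the columns asserted above
    return {'path': asset, 'variable': 'unknown'}


def example_post_process(df, times=1):
    # derive an extra column; in the test above, `times` arrives via
    # postprocess_func_kwargs={'times': 100}
    df['my_column'] = times
    return df
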
def test_builder_invalid_parser():
    with pytest.raises(TypeError):
        _ = Builder(root_path='./', parser='my_func')

def test_builder_invalid_root_path():
    with pytest.raises(FileNotFoundError):
        _ = Builder(root_path='DOES_NOT_EXIST')

class YAMLParser:
    """
    Creates a parser that parses a yaml file in order to create a catalog file.
    """

    def __init__(self, yaml_path: str, csv_path: str = None, validater: str = 'yamale') -> None:
        """
        Read and validate the yaml file that describes the catalog to be built.

        Parameters
        ----------
        yaml_path : str
            Path to the yaml file to be parsed
        csv_path : str, optional
            Full path to the output csv file
        validater : str, optional
            Choice of yaml validater. Valid options: 'yamale' or 'internal'; Default: yamale
        """
        import yaml

        self.yaml_path = yaml_path
        self.csv_path = csv_path
        self.builder = None
        self.validater = validater

        # Read in the yaml file and validate
        with open(self.yaml_path, 'r') as f:
            self.input_yaml = yaml.safe_load(f)
        self.valid_yaml = self._validate_yaml()

    def _validate_yaml(self):
        """
        Validates the generic yaml input against the schema, using either yamale or the
        internal validater.

        Parameters
        ----------
        None

        Returns
        -------
        boolean
            True - passes the validation, False - fails the validation
        """
        # verify the format is correct
        if self.validater == 'yamale':
            import yamale

            print('Validating yaml file with yamale.')
            cwd = Path(os.path.dirname(__file__))
            schema_path = str(cwd.parent / 'schema') + '/generic_schema.yaml'
            schema = yamale.make_schema(schema_path)
            data = yamale.make_data(self.yaml_path)
            try:
                yamale.validate(schema, data, strict=False)
                print('Validation success! 👍')
                return True
            except ValueError as e:
                print(f'Yamale found that your file, {self.yaml_path}, is not formatted correctly.')
                print(e)
                return False
        else:
            print('Did not validate yaml.')
            print('If unexpected results occur, try installing yamale and rerun.')
            return True

    def _parser_netcdf(self, filepath, local_attrs):
        """
        Opens a netcdf file in order to gather time and requested attribute information.
        Also attaches assigned attributes gathered from the yaml file.

        Parameters
        ----------
        filepath : str
            The full path to the netcdf file to attach attributes to.
        local_attrs : dict
            Holds the attributes that need to be attached to the file.

        Returns
        -------
        dict
            All of the attributes that need to be assigned to the netcdf file.
        """
        fileparts = {}
        try:
            fileparts['variable'] = []
            fileparts['start_time'] = []
            fileparts['end_time'] = []
            fileparts['path'] = []

            # open file
            d = nc.Dataset(filepath, 'r')

            # find what the time (unlimited) dimension is
            dims = list(dict(d.dimensions).keys())

            # loop through all variables
            for v in d.variables:
                # add all variables that are not coordinates to the catalog
                if v not in dims:
                    fileparts['variable'].append(v)
                    fileparts['path'].append(filepath)
                    if 'time' in d.variables.keys():
                        times = d['time']
                        fileparts['start_time'].append(times[0])
                        fileparts['end_time'].append(times[-1])

                    # add global attributes
                    for g in local_attrs['global'].keys():
                        if g not in fileparts.keys():
                            fileparts[g] = []
                        fileparts[g].append(local_attrs['global'][g])

                    # add the keys that are common just to the particular glob string
                    for lv in local_attrs[filepath].keys():
                        if lv not in fileparts.keys():
                            fileparts[lv] = []
                        if '<<' in local_attrs[filepath][lv]:
                            # '<<' pulls the named attribute from the variable itself
                            if hasattr(d.variables[v], lv):
                                fileparts[lv].append(getattr(d.variables[v], lv))
                            else:
                                fileparts[lv].append('NaN')
                        elif '<' in local_attrs[filepath][lv]:
                            # '<name>' pulls the named global attribute from the file
                            k = local_attrs[filepath][lv].replace('<', '').replace('>', '')
                            if hasattr(d, k):
                                fileparts[lv].append(getattr(d, k))
                            else:
                                fileparts[lv].append('NaN')
                        else:
                            fileparts[lv].append(local_attrs[filepath][lv])
            # close netcdf file
            d.close()
        except Exception:
            # if the file cannot be opened or read, return whatever has been gathered so far
            pass
        return fileparts

    def parser(self) -> 'Builder':
        """
        Method used to start the parsing process.

        Parameters
        ----------
        None

        Returns
        -------
        Builder
            Returns a Builder object.
        """
        # loop over datasets
        df_parts = []
        entries = defaultdict(dict)
        for dataset in self.input_yaml['catalog']:
            # get a list of keys that are common to all files in the dataset
            entries['global'] = {}
            for g in dataset.keys():
                if 'data_sources' not in g and 'ensemble' not in g:
                    entries['global'][g] = dataset[g]

            # loop over ensemble members, if they exist
            if 'ensemble' in dataset.keys():
                for member in dataset['ensemble']:
                    glob_string = member.pop('glob_string')
                    filelist = glob.glob(glob_string)
                    for f in filelist:
                        entries[f].update(member)

            # loop over all of the data_sources for the dataset, create a dataframe
            # for each data_source, and append that dataframe to a list that will contain
            # the full dataframe (or catalog) based on everything in the yaml file.
            for stream_info in dataset['data_sources']:
                filelist = glob.glob(stream_info['glob_string'])
                stream_info.pop('glob_string')
                for f in filelist:
                    entries[f].update(stream_info)

            partial_parser_netcdf = functools.partial(self._parser_netcdf, local_attrs=entries)
            self.builder = Builder(None, parser=partial_parser_netcdf, lazy=False)
            self.builder.filelist = [x for x in entries.keys() if x != 'global']
            df_parts.append(
                self.builder.build('path', 'variable')
                .df.set_index('path')
                .apply(lambda x: x.apply(pd.Series).stack())
                .reset_index()
                .drop(columns='level_1')
            )

        # create the combined dataframe from all of the data_sources and datasets from
        # the yaml file
        df = pd.concat(df_parts, sort=False)

        self.builder.df = df.sort_values(by=['path'])
        return self.builder