Example #1
    def parser(self) -> 'Builder':
        """
        Method used to start the parsing process.

        Parameters
        ----------
        None

        Returns
        -------
        Builder
            Returns a Builder object.
        """

        # loop over datasets
        df_parts = []
        entries = defaultdict(dict)
        # for dataset in input_yaml.keys():
        for dataset in self.input_yaml['catalog']:
            # get a list of keys that are common to all files in the dataset
            entries['global'] = {}
            for g in dataset.keys():
                if 'data_sources' not in g and 'ensemble' not in g:
                    entries['global'][g] = dataset[g]
            # loop over ensemble members, if they exist
            if 'ensemble' in dataset.keys():
                for member in dataset['ensemble']:
                    glob_string = member.pop('glob_string')
                    filelist = glob.glob(glob_string)
                    for f in filelist:
                        entries[f].update(member)
            # loop over all of the data_sources for the dataset, create a dataframe
            # for each data_source, append that dataframe to a list that will contain
            # the full dataframe (or catalog) based on everything in the yaml file.
            for stream_info in dataset['data_sources']:
                filelist = glob.glob(stream_info['glob_string'])
                stream_info.pop('glob_string')
                for f in filelist:
                    entries[f].update(stream_info)

            partial_parser_netcdf = functools.partial(self._parser_netcdf, local_attrs=entries)
            self.builder = Builder(None, parser=partial_parser_netcdf, lazy=False)
            self.builder.filelist = [x for x in entries.keys() if x != 'global']
            df_parts.append(
                self.builder.build('path', 'variable')
                .df.set_index('path')
                .apply(lambda x: x.apply(pd.Series).stack())
                .reset_index()
                .drop('level_1', axis=1)
            )

        # create the combined dataframe from all of the data_sources and datasets
        # in the yaml file
        df = pd.concat(df_parts, sort=False)
        self.builder.df = df.sort_values(by=['path'])
        return self.builder
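
For orientation, the loop above expects self.input_yaml['catalog'] to be a list of dataset entries, each carrying dataset-wide keys plus data_sources (and optionally ensemble) items that contain a glob_string. Below is a minimal sketch of such an input, written as the Python structure that yaml.safe_load would return; every key other than 'catalog', 'data_sources', 'ensemble', and 'glob_string' is illustrative, not required.

# Hypothetical input structure for parser(); only 'catalog', 'data_sources',
# 'ensemble', and 'glob_string' are referenced by the code above.
input_yaml = {
    'catalog': [
        {
            'model': 'CESM2',            # dataset-wide key, copied into entries['global']
            'experiment': 'historical',  # dataset-wide key, copied into entries['global']
            'ensemble': [
                {'glob_string': '/data/hist/member_001/*.nc', 'member_id': '001'},
            ],
            'data_sources': [
                # '<attr>' pulls a global attribute from each file,
                # '<<attr>>' pulls a per-variable attribute (see _parser_netcdf)
                {
                    'glob_string': '/data/hist/*/ocn/*.nc',
                    'component': 'ocn',
                    'long_name': '<<long_name>>',
                },
            ],
        },
    ],
}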
Example #2
def test_builder_init(paths, depth, storage_options, include_patterns,
                      exclude_patterns, num_assets):
    builder = Builder(
        paths=paths,
        depth=depth,
        storage_options=storage_options,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )
    builder.get_assets()
    assert isinstance(builder.assets, list)
    assert len(builder.assets) == num_assets
Example #3
def test_builder_save(root_path, parser):
    builder = Builder(root_path=root_path)

    with TemporaryDirectory() as local_dir:
        catalog_file = f'{local_dir}/my_catalog.csv'

        builder = builder.build(
            path_column='path', variable_column='variable_id', data_format='netcdf'
        ).save(catalog_file)
        path = f'{local_dir}/my_catalog.json'
        col = intake.open_esm_datastore(path)
        pd.testing.assert_frame_equal(col.df, builder.df)
        print(builder.df.shape)
Example #4
def test_builder_build(root_path, depth, lazy, parser, expected_df_shape):
    b = Builder(
        root_path,
        depth=depth,
        extension='*.nc',
        exclude_patterns=['*/files/*', '*/latest/*'],
        lazy=lazy,
        parser=parser,
    ).build(path_column='path', variable_column='variable_id', data_format='netcdf')

    keys = {'esmcat_version', 'id', 'description', 'attributes', 'assets', 'aggregation_control'}
    assert set(b.esmcol_data.keys()) == keys

    assert b.df.shape == expected_df_shape
    assert isinstance(b.df, pd.DataFrame)
    assert len(b.filelist) == len(b.df)
    intersection = set(cmip6_global_attrs).intersection(set(b.df.columns))
    assert intersection.issubset(set(cmip6_global_attrs))
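
The cmip6_global_attrs object used in the assertions above is a test fixture that is not shown here; a plausible stand-in, assuming it is simply a list of CMIP6 global-attribute names expected to appear as catalog columns, could look like this.

# Hypothetical stand-in for the cmip6_global_attrs fixture referenced above.
cmip6_global_attrs = [
    'activity_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'table_id',
    'grid_label',
    'variant_label',
]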
Example #5
def test_builder_save(tmp_path):
    builder = Builder(paths=[str(sample_data_dir / 'cesm')],
                      depth=5,
                      include_patterns=['*.nc'])
    builder.get_assets()
    builder.assets.append('cesm/nonexistent_file.nc')  # Add an invalid file

    with pytest.warns(UserWarning):
        builder.parse(parsing_func=parse_cesm_history).clean_dataframe()
    with pytest.warns(UserWarning):
        builder.save(
            name='test',
            path_column_name='path',
            directory=str(tmp_path),
            data_format='netcdf',
            variable_column_name='variables',
            aggregations=[],
            groupby_attrs=[],
        )
    assert not builder.invalid_assets.empty
    cat = intake.open_esm_datastore(str(tmp_path / 'test.json'))
    assert isinstance(cat.df, pd.DataFrame)
Example #6
def test_builder_update(root_path, parser, num_items, dummy_assets):

    with TemporaryDirectory() as local_dir:
        catalog_file = f'{local_dir}/dummy.csv'
        builder = Builder(
            root_path=root_path, exclude_patterns=['*/files/*', '*/latest/*'], parser=parser
        )
        builder = builder.build(
            path_column='path', variable_column='variable_id', data_format='netcdf'
        )
        builder.save(catalog_file)
        df = pd.read_csv(catalog_file).head(num_items)
        if dummy_assets:
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            df = pd.concat([df, pd.DataFrame(dummy_assets)], ignore_index=True)

        df.to_csv(catalog_file, index=False)
        builder = builder.update(catalog_file, path_column='path')
        assert builder.old_df.size == num_items + len(dummy_assets)
        assert (builder.df.size - builder.old_df.size) == builder.new_df.size - len(dummy_assets)
Example #7
def test_builder_build(paths, depth, storage_options, include_patterns,
                       exclude_patterns, num_assets):
    builder = Builder(
        paths=paths,
        depth=depth,
        storage_options=storage_options,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )
    builder.get_assets()
    assert len(builder.assets) == num_assets
    builder.build(
        parsing_func=parsing_func,
        postprocess_func=post_process_func,
        postprocess_func_kwargs={'times': 100},
    )
    assert isinstance(builder.df, pd.DataFrame)
    assert len(builder.df) == num_assets
    assert set(builder.df.columns) == {'path', 'variable', 'my_column'}
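
parsing_func and post_process_func are defined elsewhere in the test module; a minimal sketch that would be consistent with the assertions above, assuming the post-processing step receives the assembled dataframe together with the supplied keyword arguments (the placeholder values are illustrative):

def parsing_func(file):
    # Hypothetical per-asset parser: one record per file, matching the
    # 'path' and 'variable' columns checked above.
    return {'path': file, 'variable': 'placeholder'}


def post_process_func(df, times):
    # Hypothetical post-processing step: add the extra 'my_column' column.
    df['my_column'] = times
    return df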
Example #8
def test_builder_invalid_parser():
    with pytest.raises(TypeError):
        _ = Builder(root_path='./', parser='my_func')
Example #9
def test_builder_invalid_root_path():
    with pytest.raises(FileNotFoundError):
        _ = Builder(root_path='DOES_NOT_EXIST')
Example #10
import functools
import glob
import os
from collections import defaultdict
from pathlib import Path

import netCDF4 as nc
import pandas as pd

# NOTE: Builder is assumed to be importable from the surrounding package.


class YAMLParser:
    """
    Parses a yaml file describing one or more datasets in order to create a catalog file.
    """

    def __init__(
        self, yaml_path: str, csv_path: str = None, validater: str = 'yamale',
    ) -> None:
        """
        Set up the parser from a yaml file that describes the catalog contents.

        Parameters
        ----------
        yaml_path : str
            Path to the yaml file to be parsed
        csv_path : str, optional
            Full path to the output csv file
        validater : str, optional
            Choice of yaml validater.  Valid options: 'yamale' or 'internal'; Default: yamale
        """

        import yaml

        self.yaml_path = yaml_path
        self.csv_path = csv_path
        self.builder = None
        self.validater = validater

        # Read in the yaml file and validate
        with open(self.yaml_path, 'r') as f:
            self.input_yaml = yaml.safe_load(f)
        self.valid_yaml = self._validate_yaml()

    def _validate_yaml(self):
        """
        Validates the generic yaml input against the schema.  It uses either yamale or the internal validater.

        Parameters
        ----------
        None

        Returns
        -------
        boolean
            True - passes the validation, False - fails the validation
        """

        # verify the format is correct
        if self.validater == 'yamale':

            import yamale

            print('Validating yaml file with yamale.')
            cwd = Path(os.path.dirname(__file__))
            schema_path = str(cwd.parent / 'schema') + '/generic_schema.yaml'
            schema = yamale.make_schema(schema_path)
            data = yamale.make_data(self.yaml_path)
            try:
                yamale.validate(schema, data, strict=False)
                print('Validation success! 👍')
                return True
            except ValueError as e:
                print(
                    'Yamale found that your file, '
                    + self.yaml_path
                    + ' is not formatted correctly.'
                )
                print(e)
                return False
        else:
            print('Did not validate yaml.')
            print('If unexpected results occur, try installing yamale and rerun.')
            return True

    def _parser_netcdf(self, filepath, local_attrs):
        """
        Opens a netcdf file in order to gather time and requested attribute information.
        Also attaches assigned attributes gathered from the yaml file.

        Parameters
        ----------
        filepath : str
            The full path to the netcdf file to attach attributes to.
        local_attrs : dict
            Holds attributes that need to be attached to the filepath.

        Returns
        -------
        dict
            Returns all of the attributes that need to be assigned to the netcdf.
        """

        fileparts = {}

        try:
            fileparts['variable'] = []
            fileparts['start_time'] = []
            fileparts['end_time'] = []
            fileparts['path'] = []

            # open file
            d = nc.Dataset(filepath, 'r')

            # find what the time (unlimited) dimension is
            dims = list(dict(d.dimensions).keys())

            # loop through all variables
            for v in d.variables:
                # add all variables that are not coordinates to the catalog
                if v not in dims:
                    fileparts['variable'].append(v)
                    fileparts['path'].append(filepath)

                    if 'time' in d.variables.keys():
                        times = d['time']
                        fileparts['start_time'].append(times[0])
                        fileparts['end_time'].append(times[-1])

                    # add global attributes
                    for g in local_attrs['global'].keys():
                        if g not in fileparts.keys():
                            fileparts[g] = []
                        fileparts[g].append(local_attrs['global'][g])

                    # add the keys that are common just to the particular glob string
                    # fileparts.update(local_attrs[filepath])
                    for lv in local_attrs[filepath].keys():
                        if lv not in fileparts.keys():
                            fileparts[lv] = []
                        if '<<' in local_attrs[filepath][lv]:
                            if hasattr(d.variables[v], lv):
                                fileparts[lv].append(getattr(d.variables[v], lv))
                            else:
                                fileparts[lv].append('NaN')
                        elif '<' in local_attrs[filepath][lv]:
                            k = local_attrs[filepath][lv].replace('<', '').replace('>', '')
                            if hasattr(d, k):
                                fileparts[lv].append(getattr(d, k))
                            else:
                                fileparts[lv].append('NaN')
                        else:
                            fileparts[lv].append(local_attrs[filepath][lv])
            # close netcdf file
            d.close()
        except Exception:
            # if the file cannot be opened or read, return whatever was gathered so far
            pass
        return fileparts

    def parser(self) -> 'Builder':
        """
        Method used to start the parsing process.

        Parameters
        ----------
        None

        Returns
        -------
        Builder
            Returns a Builder object.
        """

        # loop over datasets
        df_parts = []
        entries = defaultdict(dict)
        # for dataset in input_yaml.keys():
        for dataset in self.input_yaml['catalog']:
            # get a list of keys that are common to all files in the dataset
            entries['global'] = {}
            for g in dataset.keys():
                if 'data_sources' not in g and 'ensemble' not in g:
                    entries['global'][g] = dataset[g]
            # loop over ensemble members, if they exist
            if 'ensemble' in dataset.keys():
                for member in dataset['ensemble']:
                    glob_string = member.pop('glob_string')
                    filelist = glob.glob(glob_string)
                    for f in filelist:
                        entries[f].update(member)
            # loop over all of the data_sources for the dataset, create a dataframe
            # for each data_source, append that dataframe to a list that will contain
            # the full dataframe (or catalog) based on everything in the yaml file.
            for stream_info in dataset['data_sources']:
                filelist = glob.glob(stream_info['glob_string'])
                stream_info.pop('glob_string')
                for f in filelist:
                    entries[f].update(stream_info)

            partial_parser_netcdf = functools.partial(self._parser_netcdf, local_attrs=entries)
            self.builder = Builder(None, parser=partial_parser_netcdf, lazy=False)
            self.builder.filelist = [x for x in entries.keys() if x != 'global']
            df_parts.append(
                self.builder.build('path', 'variable')
                .df.set_index('path')
                .apply(lambda x: x.apply(pd.Series).stack())
                .reset_index()
                .drop('level_1', axis=1)
            )

        # create the combined dataframe from all of the data_sources and datasets
        # in the yaml file
        df = pd.concat(df_parts, sort=False)
        self.builder.df = df.sort_values(by=['path'])
        return self.builder
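
Putting the class to work, a typical invocation might look like the sketch below (hypothetical paths; the snippet above does not include a save step, so the dataframe is written out by hand here).

# Hypothetical usage of YAMLParser; 'catalog.yaml' and 'catalog.csv' are example paths.
yp = YAMLParser('catalog.yaml', csv_path='catalog.csv', validater='yamale')
if yp.valid_yaml:
    builder = yp.parser()      # returns a Builder whose .df holds the catalog
    builder.df.to_csv(yp.csv_path, index=False)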