import os
import subprocess
from pathlib import Path

import pandas as pd

# `reverse_formats` is intake's template inverter; `CMIPparser` is assumed
# to come from the surrounding (project-specific) module.
from intake.source.utils import reverse_formats


def parse_dir(path, depth=12, ext='*.nc', **kwargs):
    """Retrieve all netCDF files under a given path (including sub-directories)
    and parse their names according to a given pattern (`file_fmt`).
    Returns a pd.DataFrame of the parsed elements.
    """
    path = Path(path).expanduser()

    # files = all_files = path.rglob('*.nc')
    # files = [x.name for x in all_files]
    cmd = f"find {path.as_posix()} -maxdepth {depth} -iname '{ext}'"
    find_res = subprocess.run(cmd, shell=True, capture_output=True)
    all_files = find_res.stdout.decode('utf-8').split()
    # files = all_files = glob.glob(os.path.join(path, '**/*.nc'), recursive=True)
    dirs, files = zip(*map(os.path.split, all_files))
    # files, ext = zip(*map(os.path.splitext, files))

    # guess the CMIP version from the first file unless an explicit
    # filename template was passed in
    cmip_version = kwargs.pop('cmip_version', None)
    file_fmt = kwargs.pop('file_fmt', None)
    if file_fmt is None:
        parser = CMIPparser(cmip_version, guess_by=all_files[0])
        file_fmt = parser.filename_template

    # TODO: try-except? Or how can you filter out those files that don't match file_fmt?
    # TODO: how to take into account gridspec files and normal temporal files in the same directory?
    # TODO: Check cmip.py at https://github.com/NCAR/intake-esm-datastore/blob/master/builders/cmip.py
    rev_dict = reverse_formats(file_fmt, files)
    rev_dict['path'] = all_files

    return pd.DataFrame.from_dict(rev_dict)
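
The heavy lifting above is done by `reverse_formats`, which inverts a
`str.format`-style template against a list of names. A minimal sketch of the
idea, assuming the function shipped in `intake.source.utils` (the file names
and template are invented for illustration):

import pandas as pd
from intake.source.utils import reverse_formats

names = ['tas_GFDL_2000.nc', 'pr_GFDL_2001.nc']      # invented file names
rev = reverse_formats('{variable}_{model}_{year}.nc', names)
# rev == {'variable': ['tas', 'pr'], 'model': ['GFDL', 'GFDL'],
#         'year': ['2000', '2001']}  (plain fields parse back as strings)
df = pd.DataFrame.from_dict(rev)    # one row per file, one column per field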
Example #2
    def _open_files(self, files):
        # assumes `import xarray as xr` and `import numpy as np` at module
        # level; each file is opened lazily and concatenated along self.dim
        das = [xr.open_rasterio(f, chunks=self.chunks, **self._kwargs)
               for f in files]
        out = xr.concat(das, dim=self.dim)

        # attach coordinates parsed from the file names: each field value is
        # broadcast along the slice of the concat dim that its file occupies
        coords = {}
        if self.pattern:
            coords = {
                k: xr.concat(
                    [xr.DataArray(
                        np.full(das[i].sizes.get(self.dim, 1), v),
                        dims=self.dim
                    ) for i, v in enumerate(values)], dim=self.dim)
                for k, values in reverse_formats(self.pattern, files).items()
            }

        return out.assign_coords(**coords).chunk(self.chunks)
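
To see what the `coords` comprehension builds, here is a self-contained sketch
(the dimension name, array sizes, and field values are invented): each parsed
field value is repeated along its file's extent of the concat dimension, so
the coordinate lines up slab-by-slab with the concatenated data.

import numpy as np
import xarray as xr

dim = 'band'
das = [xr.DataArray(np.zeros(2), dims=dim),   # stand-ins for two opened files
       xr.DataArray(np.zeros(3), dims=dim)]
values = ['a', 'b']                           # one parsed field value per file

coord = xr.concat(
    [xr.DataArray(np.full(das[i].sizes.get(dim, 1), v), dims=dim)
     for i, v in enumerate(values)],
    dim=dim)
# coord has length 5: 'a' twice, then 'b' three times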
Example #3
    def _open_files(self, files):
        """
        This function is called when the data source refers to more
        than one file either as a list or a glob. It sets up the
        dask graph for opening the files.

        Parameters
        ----------
        files : iterable
            List of file objects
        """
        import pandas as pd
        from xarray import DataArray

        out = multireader(files, self.chunks, self.concat_dim, **self._kwargs)
        if not self.pattern:
            return out

        coords = {}
        filenames = [f.path for f in files]
        field_values = reverse_formats(self.pattern, filenames)

        if isinstance(self.concat_dim, list):
            if not set(field_values.keys()).issuperset(set(self.concat_dim)):
                raise KeyError('All concat_dims should be in pattern.')
            index = pd.MultiIndex.from_tuples(
                zip(*(field_values[dim] for dim in self.concat_dim)),
                names=self.concat_dim)
            coords = {
                k: DataArray(v, dims='dim_0')
                for k, v in field_values.items() if k not in self.concat_dim
            }
            out = (
                out.assign_coords(dim_0=index, **coords)  # use the index
                .unstack().chunk(self.chunks))  # unstack along new index
            return out.transpose(
                *self.concat_dim,  # reorder dims
                *filter(lambda x: x not in self.concat_dim, out.dims))
        else:
            coords = {
                k: DataArray(v, dims=self.concat_dim)
                for k, v in field_values.items()
            }
            return out.assign_coords(**coords).chunk(self.chunks)
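
The multi-dimensional branch leans on a pandas MultiIndex plus `unstack` to
turn the single concatenated dimension into several orthogonal ones. A
stripped-down sketch of that trick (field names and values invented; recent
xarray versions may warn about assigning a MultiIndex directly):

import numpy as np
import pandas as pd
import xarray as xr

# four files concatenated along one dimension, parsed into two fields
models = ['A', 'A', 'B', 'B']
years = [2000, 2001, 2000, 2001]

arr = xr.DataArray(np.arange(4.0), dims='dim_0')
index = pd.MultiIndex.from_tuples(zip(models, years), names=['model', 'year'])
arr = arr.assign_coords(dim_0=index).unstack()
# arr now has dims ('model', 'year') with shape (2, 2)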
Example #4
def test_roundtrip_reverse_formats(pattern):
    # `pattern` is supplied by pytest parametrization; `paths` is a
    # module-level list of example path strings
    args = reverse_formats(pattern, paths)
    for i, path in enumerate(paths):
        assert pattern.format(
            **{field: values[i] for field, values in args.items()}) == path
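
Concretely, the invariant under test is that parsing and re-formatting are
inverses: feeding the fields recovered by `reverse_formats` back through
`pattern.format` must reproduce every original path. With invented values:

# pattern = '{variable}_{year}.nc'; paths = ['tas_2000.nc', 'pr_2001.nc']
# reverse_formats(pattern, paths)
#   -> {'variable': ['tas', 'pr'], 'year': ['2000', '2001']}
# pattern.format(variable='tas', year='2000') == 'tas_2000.nc'   # round trip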