Beispiel #1
0
def open_mf_wrf_dataset(paths,
                        chunks=None,
                        compat='no_conflicts',
                        lock=None,
                        preprocess=None):
    """Open multiple WRF files as a single WRF dataset.

    Requires dask to be installed. Note that if your files are sliced by time,
    certain diagnostic variable computed out of accumulated variables (e.g.
    PRCP) won't be available, because not computable lazily.

    This code is adapted from xarray's open_mfdataset function. The xarray
    license is reproduced in the salem/licenses directory.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form `path/to/my/files/*.nc` or an
        explicit list of files to open. Only the matches of a glob string
        are sorted; an explicit sequence is used in the order given.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by chunk
        sizes. In general, these should divide the dimensions of each dataset.
        If int, chunk each dimension by ``chunks`` .
        By default, chunks will be chosen to load entire input files into
        memory at once. This has a major impact on performance: please see
        xarray's full documentation for more details.
    compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to concatenation.
    lock : False, True or threading.Lock, optional
        Forwarded to ``open_wrf_dataset``. If that call rejects the keyword
        with a ``TypeError`` mentioning 'lock', the files are opened again
        without it. When None, ``NETCDF4_PYTHON_LOCK`` is used.

    Returns
    -------
    xarray.Dataset
        The datasets concatenated along 'time', carrying the attributes of
        the first (preprocessed) dataset and without the variables PRCP,
        PRCP_C and PRCP_NC.

    Raises
    ------
    IOError
        If ``paths`` is empty or the glob pattern matches nothing.

    Notes
    -----
    Calling this function sets dask's scheduler configuration to
    'single-threaded' via ``dask.config.set`` outside of any context
    manager, i.e. the setting stays in effect after the call returns.
    """

    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    # TODO: current workaround to dask thread problems
    import dask
    dask.config.set(scheduler='single-threaded')

    if lock is None:
        lock = NETCDF4_PYTHON_LOCK
    try:
        datasets = [
            open_wrf_dataset(p, chunks=chunks or {}, lock=lock) for p in paths
        ]
    except TypeError as err:
        if 'lock' not in str(err):
            raise
        # New xarray backends
        datasets = [open_wrf_dataset(p, chunks=chunks or {}) for p in paths]

    # Keep a handle on the datasets as opened: preprocess may return new
    # objects, but it is the originals that own the file handles.
    orig_datasets = datasets

    def ds_closer():
        for ods in orig_datasets:
            ods.close()

    try:
        if preprocess is not None:
            datasets = [preprocess(ds) for ds in datasets]

        # Feature-detect instead of catching AttributeError around the call:
        # an AttributeError raised *inside* combine_nested must not be
        # mistaken for "this xarray has no combine_nested".
        combine = getattr(xr, 'combine_nested', None)
        if combine is None:
            combine = xr.auto_combine
        combined = combine(datasets, concat_dim='time', compat=compat)
    except Exception:
        # Nobody else holds a reference to the opened files at this point,
        # so release them before propagating the error.
        ds_closer()
        raise
    combined.attrs = datasets[0].attrs

    if hasattr(combined, 'set_close'):
        combined.set_close(ds_closer)
    else:
        # Older xarray: closing goes through the private _file_obj attribute
        from xarray.backends.api import _MultiFileCloser
        mfc = _MultiFileCloser([ods._file_obj for ods in orig_datasets])
        combined._file_obj = mfc

    # drop accumulated vars if needed (TODO: make this not hard coded)
    vns = ['PRCP', 'PRCP_C', 'PRCP_NC']
    vns = [vn for vn in vns if vn in combined.variables]
    if hasattr(combined, 'drop_vars'):
        combined = combined.drop_vars(vns)
    else:
        combined = combined.drop(vns)

    return combined
Beispiel #2
0
def open_mfbpchdataset(paths,
                       concat_dim='time',
                       compat='no_conflicts',
                       preprocess=None,
                       lock=None,
                       **kwargs):
    """Open several bpch files and combine them into one dataset.

    Every file is opened on its own with ``open_bpchdataset`` and the
    resulting datasets are then combined with ``xr.auto_combine``. The
    caller has to opt in to dask explicitly by passing ``dask=True``.

    Parameters
    ----------
    paths : str or list of strs
        A glob pattern or an explicit list of filenames. The matches of a
        glob pattern are sorted lexicographically; an explicit list is read
        in the order given.
    concat_dim : str, default='time'
        Dimension along which the datasets are concatenated.
    compat : str (optional)
        How variables of the same name are compared for conflicts when
        merging; handed unchanged to ``xr.auto_combine``:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable (optional)
        Applied to each dataset before the datasets are combined.
    lock : False, True, or threading.Lock (optional)
        Accepted for API symmetry; this function does not use it.
    **kwargs : optional
        Forwarded to ``open_bpchdataset``. Must contain a truthy ``dask``
        entry, which is replaced by ``dask=True`` before forwarding.

    Returns
    -------
    xarray.Dataset
        The combined dataset. Its attributes are those of the first
        (preprocessed) dataset plus a 'history' entry naming the files.

    Raises
    ------
    ValueError
        If ``dask`` is missing from ``kwargs`` or falsy.
    IOError
        If ``paths`` is empty or the glob pattern matches nothing.

    """

    from xarray.backends.api import _MultiFileCloser

    # TODO: Include file locks?

    # Multi-file reading is only supported through dask
    if not kwargs.pop('dask', False):
        raise ValueError(
            "Reading multiple files without dask is not supported")
    open_kwargs = dict(kwargs, dask=True)

    # A plain string is a glob pattern; anything else is taken as given
    if isinstance(paths, str):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError("No paths to files were passed into open_mfbpchdataset")

    opened = [open_bpchdataset(fn, **open_kwargs) for fn in paths]
    # Collect the file handles before preprocess can swap the datasets out
    handles = [one._file_obj for one in opened]

    if preprocess is None:
        prepared = opened
    else:
        prepared = [preprocess(one) for one in opened]

    # Concatenate over the requested dimension
    combined = xr.auto_combine(prepared, compat=compat, concat_dim=concat_dim)

    combined._file_obj = _MultiFileCloser(handles)
    combined.attrs = prepared[0].attrs

    # Record provenance in the combined dataset's attributes
    stamp = get_timestamp()
    joined_names = " ".join(paths)
    history = "{}: Processed/loaded by xbpch-{} from {}".format(
        stamp, ver, joined_names)
    combined.attrs['history'] = history

    return combined
Beispiel #3
0
def open_mf_wrf_dataset(paths, chunks=None, compat='no_conflicts', lock=None,
                        preprocess=None):
    """Open multiple WRF files as a single WRF dataset.

    Requires dask to be installed. Note that if your files are sliced by time,
    certain diagnostic variable computed out of accumulated variables (e.g.
    PRCP) won't be available, because not computable lazily.

    This code is adapted from xarray's open_mfdataset function. The xarray
    license is reproduced in the salem/licenses directory.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an explicit
        list of files to open. Only the matches of a glob string are sorted;
        an explicit sequence is used in the order given.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by chunk
        sizes. In general, these should divide the dimensions of each dataset.
        If int, chunk each dimension by ``chunks``.
        By default, chunks will be chosen to load entire input files into
        memory at once. This has a major impact on performance: please see
        xarray's full documentation for more details.
    compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to concatenation.
    lock : False, True or threading.Lock, optional
        Forwarded to ``open_wrf_dataset``. When None, the result of
        ``_default_lock(paths[0], 'netcdf4')`` is used.

    Returns
    -------
    xarray.Dataset
        The datasets concatenated along 'time', carrying the attributes of
        the first (preprocessed) dataset and without the variables PRCP,
        PRCP_C and PRCP_NC.

    Raises
    ------
    IOError
        If ``paths`` is empty or the glob pattern matches nothing.

    """
    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    # TODO: current workaround to dask thread problems
    # The synchronous scheduler used to be selected with
    # ``dask.set_options(get=dask.async.get_sync)``, but ``async`` is a
    # reserved word since Python 3.7, so that spelling no longer even parses.
    try:
        dask.config.set(scheduler='single-threaded')
    except AttributeError:
        # Fallback for dask releases that predate ``dask.config``.
        # NOTE(review): module names below follow dask's historical layout
        # (``dask.local``, formerly ``dask.async``) -- confirm against the
        # oldest dask version that still has to be supported.
        try:
            from dask.local import get_sync
        except ImportError:
            from importlib import import_module
            get_sync = import_module('dask.async').get_sync
        dask.set_options(get=get_sync)

    if lock is None:
        lock = _default_lock(paths[0], 'netcdf4')
    datasets = [open_wrf_dataset(p, chunks=chunks or {}, lock=lock)
                for p in paths]
    # Grab the file handles before preprocess can replace the datasets
    file_objs = [ds._file_obj for ds in datasets]

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    # Honour the documented ``compat`` argument instead of silently
    # ignoring it.
    combined = xr.auto_combine(datasets, concat_dim='time', compat=compat)
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    # drop accumulated vars if needed (TODO: make this not hard coded)
    vns = ['PRCP', 'PRCP_C', 'PRCP_NC']
    vns = [vn for vn in vns if vn in combined.variables]
    combined = combined.drop(vns)

    return combined
def open_mfdataset(paths,
                   chunks=None,
                   concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts',
                   preprocess=None,
                   engine=None,
                   lock=None,
                   **kwargs):
    """Open multiple files as a single dataset.

    This function is adapted from the xarray function of the same name.
    The main difference is that instead of failing on files that do not
    exist, this function keeps processing.

    Requires dask to be installed.  Attributes from the first dataset file
    that could be opened are used for the combined dataset.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an
        explicit list of files to open. Only the matches of a glob string
        are sorted; an explicit sequence is used in the order given.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``.
        By default, chunks will be chosen to load entire input files into
        memory at once. This has a major impact on performance: please see
        the full documentation for more details.
    concat_dim : None, str, DataArray or Index, optional
        Dimension to concatenate files along. This argument is passed on to
        :py:func:`xarray.auto_combine` along with the dataset objects. You
        only need to provide this argument if the dimension along which you
        want to concatenate is not a dimension in the original datasets,
        e.g., if you want to stack a collection of 2D arrays along a third
        dimension. By default, xarray attempts to infer this argument by
        examining component files. Set ``concat_dim=None`` explicitly to
        disable concatenation. A ``pandas.Index`` is reduced to the entries
        whose file could actually be opened.
    compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
        Engine to use when reading files. If not provided, the default
        engine is chosen based on available dependencies, with a preference
        for 'netcdf4'.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.
    **kwargs : optional
        Additional arguments passed on to the per-file opener (e.g.
        ``autoclose``, which is not a named parameter of this function).

    Returns
    -------
    xarray.Dataset

    Raises
    ------
    IOError
        If ``paths`` is empty, the glob pattern matches nothing, or none of
        the files could be opened.

    See Also
    --------
    auto_combine
    open_dataset

    """
    # NOTE: these filters are installed process-wide and stay active after
    # the call returns.
    filterwarnings('ignore', 'elementwise comparison failed;')
    filterwarnings('ignore', 'numpy equal will not check object')

    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    if lock is None:
        lock = _default_lock(paths[0], engine)
    attempted = [
        _open_dataset(p,
                      engine=engine,
                      chunks=chunks or {},
                      lock=lock,
                      **kwargs) for p in paths
    ]

    # _open_dataset is expected to hand back None for a file it could not
    # open (presumably a missing file -- verify against its definition).
    # Drop those entries once, up front, so that every later step -- with or
    # without ``preprocess`` -- only ever sees real datasets.
    opened_idx = [ind for ind, ds in enumerate(attempted) if ds is not None]
    if not opened_idx:
        raise IOError('none of the given files could be opened')
    datasets = [attempted[ind] for ind in opened_idx]
    file_objs = [ds._file_obj for ds in datasets]

    if isinstance(concat_dim, pd.Index):
        # Keep the index aligned with the datasets that survived; ``take``
        # is followed by an explicit rename to make sure the name is kept.
        name = concat_dim.name
        concat_dim = concat_dim.take(opened_idx)
        concat_dim.name = name

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    if concat_dim is _CONCAT_DIM_DEFAULT:
        combined = auto_combine(datasets, compat=compat)
    else:
        combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat)
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    return combined