Example #1
def open_mf_wrf_dataset(paths, chunks=None,  compat='no_conflicts', lock=None,
                        preprocess=None):
    """Open multiple WRF files as a single WRF dataset.

    Requires dask to be installed. Note that if your files are sliced by time,
    certain diagnostic variables computed from accumulated variables (e.g.
    PRCP) won't be available, because they cannot be computed lazily.

    This code is adapted from xarray's open_mfdataset function. The xarray
    license is reproduced in the salem/licenses directory.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an explicit
        list of files to open.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by chunk
        sizes. In general, these should divide the dimensions of each dataset.
        If int, chunk each dimension by ``chunks``.
        By default, chunks will be chosen to load entire input files into
        memory at once. This has a major impact on performance: please see
        xarray's full documentation for more details.
    compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to concatenation.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.

    Returns
    -------
    xarray.Dataset

    """
    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    # TODO: current workaround to dask thread problems
    dask.set_options(get=dask.async.get_sync)

    if lock is None:
        lock = _default_lock(paths[0], 'netcdf4')
    datasets = [open_wrf_dataset(p, chunks=chunks or {}, lock=lock)
                for p in paths]
    file_objs = [ds._file_obj for ds in datasets]

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    # TODO: add compat=compat when xarray 0.9 is out
    combined = xr.auto_combine(datasets, concat_dim='time')
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    # drop accumulated vars if needed (TODO: make this not hard coded)
    vns = ['PRCP', 'PRCP_C', 'PRCP_NC']
    vns = [vn for vn in vns if vn in combined.variables]
    combined = combined.drop(vns)

    return combined
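
A minimal usage sketch for the function above (not part of the original example). It assumes the function is exposed as salem.open_mf_wrf_dataset, that WRF output files exist at the hypothetical glob path, and that they contain the standard T2 field; the preprocess callable is illustrative only.

from salem import open_mf_wrf_dataset

def tag_source(ds):
    # Illustrative preprocess step, applied to each per-file dataset
    # before the datasets are combined along 'time'.
    ds.attrs['source'] = 'wrf-run'
    return ds

ds = open_mf_wrf_dataset('path/to/wrfout_d01_*.nc',  # hypothetical glob
                         chunks={'time': 1},         # one chunk per output step
                         preprocess=tag_source)

# Variables stay dask-backed (lazy); trigger the read explicitly.
print(ds['T2'].mean().compute())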
Example #2
def open_mfdataset(paths,
                   chunks=None,
                   concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts',
                   preprocess=None,
                   engine=None,
                   lock=None,
                   **kwargs):
    '''Open multiple files as a single dataset.

    This function is adapted from the xarray function of the same name.
    The main difference is that instead of failing on files that do not
    exist, this function keeps processing.

    Requires dask to be installed.  Attributes from the first dataset file
    are used for the combined dataset.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an
        explicit list of files to open.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``.
        By default, chunks will be chosen to load entire input files into
        memory at once. This has a major impact on performance: please see
        the full documentation for more details.
    concat_dim : None, str, DataArray or Index, optional
        Dimension to concatenate files along. This argument is passed on to
        :py:func:`xarray.auto_combine` along with the dataset objects. You
        only need to provide this argument if the dimension along which you
        want to concatenate is not a dimension in the original datasets,
        e.g., if you want to stack a collection of 2D arrays along a third
        dimension. By default, xarray attempts to infer this argument by
        examining component files. Set ``concat_dim=None`` explicitly to
        disable concatenation.
    compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
        Engine to use when reading files. If not provided, the default
        engine is chosen based on available dependencies, with a preference
        for 'netcdf4'.
    autoclose : bool, optional
        If True, automatically close files to avoid an OS error from too
        many open files. However, this option doesn't work with streams,
        e.g., BytesIO.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.
    **kwargs : optional
        Additional arguments passed on to :py:func:`xarray.open_dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    auto_combine
    open_dataset

    '''
    filterwarnings('ignore', 'elementwise comparison failed;')
    filterwarnings('ignore', 'numpy equal will not check object')

    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    if lock is None:
        lock = _default_lock(paths[0], engine)
    datasets = [
        _open_dataset(p,
                      engine=engine,
                      chunks=chunks or {},
                      lock=lock,
                      **kwargs) for p in paths
    ]
    file_objs = [ds._file_obj for ds in datasets if ds is not None]

    if isinstance(concat_dim, pd.Index):
        name = concat_dim.name
        concat_dim = concat_dim.take(
            [ind for ind, ds in enumerate(datasets) if ds is not None])
        concat_dim.name = name

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets if ds is not None]

    if concat_dim is _CONCAT_DIM_DEFAULT:
        combined = auto_combine(datasets, compat=compat)
    else:
        combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat)
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    return combined
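
A hypothetical usage sketch of the open_mfdataset variant above, assuming it is importable from the module that defines it. A pandas Index is passed as concat_dim to stack per-year files along a new 'year' dimension; paths that cannot be opened are skipped (and the matching index entries dropped) rather than aborting the read, which is what distinguishes this function from xarray.open_mfdataset.

import pandas as pd

paths = ['data/run_2000.nc', 'data/run_2001.nc', 'data/run_2002.nc']  # hypothetical files
years = pd.Index([2000, 2001, 2002], name='year')

ds = open_mfdataset(paths,
                    concat_dim=years,       # stack files along a new 'year' dimension
                    chunks={'time': 12},
                    engine='netcdf4')
print(ds)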