Example #1
def open_dataset(ds_id, file_paths, apply_fixes=True):
    """
    Opens an xarray Dataset and applies fixes if requested.
    Fixes are applied to the data either before or after the dataset is opened.
    Whether a fix is a 'pre-processor' or 'post-processor' is defined in the
    fix itself.

    :param ds_id: Dataset identifier in the form of a drs id
                  e.g. cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga
    :param file_paths: (list) The file paths corresponding to the ds id.
    :param apply_fixes: Boolean. If True, fixes will be applied to datasets where needed. Default is True.
    :return: xarray Dataset with fixes applied to the data.
    """
    if apply_fixes:
        fix = fixer.Fixer(ds_id)
        # `fix.pre_processor` is the single callable handed to the `preprocess`
        # hook below; `fix.pre_processors` lists the individual steps for logging.
        if fix.pre_processor:
            for pre_process in fix.pre_processors:
                LOGGER.info(
                    f"Loading data with pre_processor: {pre_process.__name__}")
        else:
            LOGGER.info("Loading data")

        ds = open_xr_dataset(file_paths, preprocess=fix.pre_processor)

        if fix.post_processors:
            for post_process in fix.post_processors:
                func, operands = post_process
                LOGGER.info(
                    f"Running post-processing function: {func.__name__}")
                ds = func(ds, **operands)

    else:
        ds = open_xr_dataset(file_paths)

    return ds
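
A minimal usage sketch. The drs id is the one from the docstring; the file path, and the assumption that `open_dataset` is importable as shown, are illustrative only:

# Hypothetical usage -- the file path below is made up for illustration:
ds_id = "cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga"
file_paths = ["/data/cmip5/zostoga_Omon_inmcm4_rcp45_r1i1p1_200601-210012.nc"]

ds = open_dataset(ds_id, file_paths)  # fixes applied where defined
ds_raw = open_dataset(ds_id, file_paths, apply_fixes=False)  # plain open, no fixes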
Example #2
    def _resolve_dsets(self, ds):
        """
        Take in the `ds` object and load it as an xarray Dataset if it
        is a path/wildcard. Set the result to `self.ds`.
        """
        if isinstance(ds, (str, Path)):
            ds = expand_wildcards(ds)
            ds = open_xr_dataset(ds)

        self.ds = ds
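
A short sketch of the three kinds of input `_resolve_dsets` accepts; the object and paths here are hypothetical:

from pathlib import Path

# Hypothetical calls -- `obj` and the paths are illustrative only:
obj._resolve_dsets("/data/cmip5/tas_*.nc")            # wildcard string: expanded, then opened
obj._resolve_dsets(Path("/data/cmip5/tas_Amon.nc"))   # Path: expanded, then opened
obj._resolve_dsets(already_open_dataset)              # xr.Dataset: stored on self.ds unchanged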
Example #3
def test_open_xr_dataset_retains_time_encoding(load_test_data):
    dset = CMIP5_TAS_EC_EARTH
    ds = open_xr_dataset(dset)
    assert isinstance(ds, xr.Dataset)
    assert hasattr(ds, "time")
    assert ds.time.encoding.get("units") == "days since 1850-01-01 00:00:00"

    # Now test without our clever opener - to prove time encoding is lost
    kwargs = {"use_cftime": True, "decode_timedelta": False, "combine": "by_coords"}
    ds = xr.open_mfdataset(glob.glob(dset), **kwargs)
    assert ds.time.encoding == {}
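
The lost encoding comes from `xr.open_mfdataset`, which drops per-variable encoding when combining files. A minimal sketch of one way an opener can retain it, by re-reading the encoding from the first file; this is a guess at the technique, not the actual `open_xr_dataset` source:

import glob
import xarray as xr

def open_mf_keep_time_encoding(path_pattern, **kwargs):
    # Hypothetical helper, not the real open_xr_dataset implementation.
    files = sorted(glob.glob(path_pattern))
    ds = xr.open_mfdataset(files, use_cftime=True, decode_timedelta=False,
                           combine="by_coords", **kwargs)
    # open_mfdataset leaves ds.time.encoding empty, so copy it from one file
    with xr.open_dataset(files[0], use_cftime=True) as first:
        ds.time.encoding = dict(first.time.encoding)
    return ds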
Example #4
    def _extract(self):
        ds = xarray_utils.open_xr_dataset(self._files)
        LOGGER.info("NEED TO CHECK NUMBER OF VARS/DOMAINS RETURNED HERE")
        LOGGER.info(
            "DOES NOT CHECK YET WHETHER WE MIGHT GET 2 DOMAINS/VARIABLES BACK FROM MULTI-FILE OPEN"
        )
        # Get content by variable
        da = ds[self._var_id]
        self.character = {
            "scan_metadata": get_scan_metadata(self._mode, self._location),
            "variable": get_variable_metadata(da),
            "coordinates": get_coords(da),
            "global_attrs": get_global_attrs(ds, self._expected_attrs),
            "data": get_data_info(da, self._mode),
        }
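
The `get_*` helpers above are project-specific. As a rough, hypothetical sketch of the kind of information they collect, expressed with plain xarray attributes (the helper semantics are assumptions, not taken from their source):

# Hypothetical stand-ins for the project-specific helpers (assumptions):
da = ds["tas"]                   # "tas" is an illustrative var_id
variable_meta = dict(da.attrs)   # roughly what get_variable_metadata(da) gathers
coord_names = list(da.coords)    # roughly what get_coords(da) inspects
global_attrs = dict(ds.attrs)    # roughly what get_global_attrs(ds, ...) checks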
Example #5
def consolidate(collection, **kwargs):
    """
    Finds the file paths relating to each input dataset. If a time range has been
    supplied, only the files within that range are recorded.

    :param collection: (roocs_utils.CollectionParameter) The collection of datasets to process.
    :param kwargs: Arguments of the operation taking place e.g. subset, average, or re-grid.
    :return: An ordered dictionary of each dataset from the collection argument and the file paths
             relating to it.
    """
    catalog = None
    time = None

    collection = _wrap_sequence(collection.tuple)

    if not isinstance(collection[0], FileMapper):
        project = get_project_name(collection[0])
        catalog = get_catalog(project)

    filtered_refs = collections.OrderedDict()

    if "time" in kwargs:
        time = kwargs["time"].asdict()

    for dset in collection:
        if not catalog:
            try:
                consolidated = dset_to_filepaths(dset, force=True)

                if time:
                    file_paths = consolidated
                    LOGGER.info(
                        f"Testing {len(file_paths)} files in time range: ...")
                    files_in_range = []

                    ds = open_xr_dataset(dset)

                    # Default missing bounds to the dataset's own time extent
                    if time["start_time"] is None:
                        time["start_time"] = ds.time.values.min().strftime("%Y")
                    if time["end_time"] is None:
                        time["end_time"] = ds.time.values.max().strftime("%Y")

                    times = [
                        int(time["start_time"].split("-")[0]),
                        int(time["end_time"].split("-")[0]) + 1,
                    ]
                    required_years = set(range(*times))

                    # Keep only files whose years overlap the requested range
                    for i, fpath in enumerate(file_paths):
                        LOGGER.info(f"File {i}: {fpath}")
                        ds = open_xr_dataset(fpath)
                        found_years = {int(_) for _ in ds.time.dt.year}

                        if required_years.intersection(found_years):
                            files_in_range.append(fpath)

                    LOGGER.info(f"Kept {len(files_in_range)} files")
                    consolidated = files_in_range[:]
                    if len(files_in_range) == 0:
                        raise Exception(
                            f"No files found in given time range for {dset}")

            # catch where "time" attribute cannot be accessed in ds
            except AttributeError:
                pass

            filtered_refs[dset] = consolidated

        else:
            ds_id = derive_ds_id(dset)
            result = catalog.search(collection=ds_id, time=time)

            if len(result) == 0:
                result = catalog.search(collection=ds_id, time=None)
                if len(result) > 0:
                    raise Exception(
                        f"No files found in given time range for {dset}")
                else:
                    raise InvalidCollection(
                        f"{dset} is not in the list of available data.")

            LOGGER.info(f"Found {len(result)} files")

            filtered_refs = result.files()

    return filtered_refs
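
A hedged usage sketch. `CollectionParameter` and `TimeParameter` are the `roocs_utils` parameter classes the docstring points to; the dataset id and time interval below are illustrative:

# Illustrative only -- the ds id and time range are not real inputs:
from roocs_utils.parameter import collection_parameter, time_parameter

collection = collection_parameter.CollectionParameter(
    "cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga"
)
time = time_parameter.TimeParameter("2085-01-01T00:00:00/2120-12-30T00:00:00")

filtered_refs = consolidate(collection, time=time)
for dset, file_paths in filtered_refs.items():
    print(dset, len(file_paths))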
Example #6
def test_open_xr_dataset(load_test_data):
    dset = C3S_CMIP5_TAS
    ds = open_xr_dataset(dset)
    assert isinstance(ds, xr.Dataset)