Esempio n. 1
def tseries_point(ds: xr.Dataset,
                  point: PointLike.TYPE,
                  var: VarNamesLike.TYPE = None,
                  method: str = 'nearest') -> xr.Dataset:
    """
    Extract time-series from *ds* at the position given by *point* using
    interpolation *method* for each *var* given in a comma separated list of
    variable names.

    The operation returns a new timeseries dataset, that contains the point
    timeseries for all required variables with original variable
    meta-information preserved.

    If a variable has more than three dimensions, the resulting timeseries
    variable will preserve all other dimensions except for lat/lon.

    :param ds: The dataset from which to perform timeseries extraction.
    :param point: Point to extract; its ``x`` is used as longitude and its
                  ``y`` as latitude.
    :param var: Variable(s) for which to perform the timeseries selection
                if none is given, all variables in the dataset will be used.
    :param method: Interpolation method to use.
    :return: A timeseries dataset
    """
    point = PointLike.convert(point)
    lon = point.x
    lat = point.y

    # No explicit selection means "every variable": '*' is the wildcard
    # pattern handed to select_var.
    if not var:
        var = '*'

    retset = select_var(ds, var=var)
    indexers = {'lat': lat, 'lon': lon}
    return retset.sel(method=method, **indexers)
Esempio n. 2
    def test_select(self):
        """
        Verify ``select_var`` for the no-op, no-match, single-name,
        multi-name and wildcard cases.
        """
        full = xr.Dataset({'abc': ('x', [1, 2, 3]),
                           'bde': ('x', [4, 5, 6])})

        # An omitted or empty selector must hand back every variable.
        self.assertDatasetEqual(full, select_var(full))
        self.assertDatasetEqual(full, select_var(full, var=''))

        # A name that is not in the dataset must drop everything.
        nothing_left = xr.Dataset({'abc': ('x', [1, 2, 3])}).drop('abc')
        self.assertDatasetEqual(nothing_left, select_var(full, var='xyz'))

        # A single exact name keeps just that variable.
        picked = select_var(full, var='abc')
        abc_only = xr.Dataset({'abc': ('x', [1, 2, 3])})
        self.assertDatasetEqual(abc_only, picked)

        # A comma separated list naming both variables keeps both.
        self.assertDatasetEqual(full, select_var(full, var='abc,bde'))

        # The wildcard pattern '*b*' is expected to keep both variables.
        self.assertDatasetEqual(full, select_var(full, var='*b*'))
Esempio n. 3
    def test_select(self):
        """
        Exercise ``select_var`` with no selector, an empty selector, an
        unknown name, a single name, a comma separated list and a wildcard.
        """
        dataset = xr.Dataset({
            'abc': ('x', [1, 2, 3]),
            'bde': ('x', [4, 5, 6])
        })

        # Test if nothing gets dropped if nothing has to be dropped
        actual = select_var(dataset)
        self.assertDatasetEqual(dataset, actual)

        actual = select_var(dataset, var='')
        self.assertDatasetEqual(dataset, actual)

        # Test that everything is dropped if the desired name does not exist in
        # the dataset
        expected = xr.Dataset({'abc': ('x', [1, 2, 3])})
        expected = expected.drop('abc')
        actual = select_var(dataset, var='xyz')
        self.assertDatasetEqual(expected, actual)

        # Test that a single variable selection works
        actual = select_var(dataset, var='abc')
        expected = xr.Dataset({'abc': ('x', [1, 2, 3])})
        self.assertDatasetEqual(expected, actual)

        # Test that simple multiple variable selection works
        actual = select_var(dataset, var='abc,bde')
        self.assertDatasetEqual(dataset, actual)

        # Test that wildcard selection works
        actual = select_var(dataset, var='*b*')
        self.assertDatasetEqual(dataset, actual)
Esempio n. 4
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see the CF Conventions'
    section on climatological statistics.

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    :raises ValidationError: if the time coordinate is not of type
            datetime64[ns], or if the dataset has no
            'time_coverage_resolution' attribute
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution. Running'
            ' the adjust_temporal_attrs operation beforehand may'
            ' help.')

    var = VarNamesLike.convert(var)

    # Restrict the dataset to the requested variables, if any were given.
    if var:
        ds = select_var(ds, var)

    # Dispatch on the declared temporal resolution; anything that is neither
    # daily nor monthly is handled by the general implementation.
    if t_resolution == 'P1D':
        return _lta_daily(ds)
    elif t_resolution == 'P1M':
        return _lta_monthly(ds, monitor)
    else:
        return _lta_general(ds, monitor)
Esempio n. 5
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see the CF Conventions'
    section on climatological statistics.

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    :raises ValidationError: if the time coordinate is not of type
            datetime64[ns], or if the dataset has no
            'time_coverage_resolution' attribute
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Long term average operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution. Running'
                              ' the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    var = VarNamesLike.convert(var)
    # Shallow copy, optionally narrowed down to the requested variables
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    # Dispatch on the declared temporal resolution; anything that is neither
    # daily nor monthly is handled by the general implementation.
    if t_resolution == 'P1D':
        return _lta_daily(retset, monitor)
    elif t_resolution == 'P1M':
        return _lta_monthly(retset, monitor)
    else:
        return _lta_general(retset, monitor)
Esempio n. 6
def tseries_point(ds: xr.Dataset,
                  point: PointLike.TYPE,
                  var: VarNamesLike.TYPE = None,
                  method: str = 'nearest') -> xr.Dataset:
    """
    Extract time-series from *ds* at given *lon*, *lat* position using
    interpolation *method* for each *var* given in a comma separated list of
    variable names.

    The operation returns a new timeseries dataset, that contains the point
    timeseries for all required variables with original variable
    meta-information preserved.

    If a variable has more than three dimensions, the resulting timeseries
    variable will preserve all other dimensions except for lon/lat.

    :param ds: The dataset from which to perform timeseries extraction.
    :param point: Point to extract, e.g. (lon,lat)
    :param var: Variable(s) for which to perform the timeseries selection
                if none is given, all variables in the dataset will be used.
    :param method: Interpolation method to use.
    :return: A timeseries dataset
    """
    point = PointLike.convert(point)
    lon = point.x
    lat = point.y

    # No explicit selection means "every variable": '*' is the wildcard
    # pattern handed to select_var.
    if not var:
        var = '*'

    retset = select_var(ds, var=var)
    indexers = {'lat': lat, 'lon': lon}
    retset = retset.sel(method=method, **indexers)

    # The dataset is no longer a spatial dataset -> drop associated global
    # attributes
    drop = [
        'geospatial_bounds_crs', 'geospatial_bounds_vertical_crs',
        'geospatial_vertical_min', 'geospatial_vertical_max',
        'geospatial_vertical_positive', 'geospatial_vertical_units',
        'geospatial_vertical_resolution', 'geospatial_lon_min',
        'geospatial_lat_min', 'geospatial_lon_max', 'geospatial_lat_max'
    ]

    # pop() with a default so that attributes absent from the dataset are
    # silently skipped.
    for key in drop:
        retset.attrs.pop(key, None)

    return retset
Esempio n. 7
File: Progetto: whigg/cate
def _generic_index_calculation(
        ds: xr.Dataset,
        var: VarName.TYPE,
        region: PolygonLike.TYPE,
        window: int,
        file: str,
        name: str,
        threshold: float = None,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation. Where an index is defined as an anomaly
    against the given reference of a moving average of the given window size of
    the given region of the given variable of the given dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param name: Name of the index
    :param threshold: Absolute threshold that indicates an ENSO event
    :param monitor: a progress monitor.
    :return: A data frame that contains the index timeseries in a column
             called *name*; if *threshold* is given it also contains the
             boolean columns 'El Nino' and 'La Nina'
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values, columns=[name], index=ts.time)
        # Centered moving average; dropna() removes the rows for which the
        # window yields no value.
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    # Flag the time steps at which the index exceeds +threshold / falls below
    # -threshold. The series are indexed like retval itself.
    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
Esempio n. 8
def tseries_mean(ds: xr.Dataset,
                 var: VarNamesLike.TYPE,
                 std_suffix: str = '_std',
                 calculate_std: bool = True,
                 monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Extract spatial mean timeseries of the provided variables and return a
    dataset in which each selected variable is replaced by its mean
    timeseries. If *calculate_std* is set, an accompanying standard deviation
    variable named ``<variable name> + std_suffix`` is added per variable.

    If a data variable with more dimensions than time/lat/lon is provided,
    the data will be reduced by taking the mean of all data values at a single
    time position resulting in one dimensional timeseries data variable.

    :param ds: The dataset from which to perform timeseries extraction.
    :param var: Variables for which to perform timeseries extraction
    :param calculate_std: Whether to calculate std in addition to mean
    :param std_suffix: Std suffix to use for resulting datasets, if std is calculated.
    :param monitor: a progress monitor.
    :return: Dataset with timeseries variables
    """
    if not var:
        var = '*'

    retset = select_var(ds, var)
    # Snapshot the names: the loop adds std variables to retset, so iterating
    # a live view of its data variables would be fragile.
    names = list(retset.data_vars)

    with monitor.starting("Calculate mean", total_work=len(names)):
        for name in names:
            # Reduce over every dimension except time, so that one value per
            # time step remains.
            dims = [dim for dim in ds[name].dims if dim != 'time']
            with monitor.child(1).observing("Calculate mean"):
                retset[name] = retset[name].mean(dim=dims, keep_attrs=True)
            retset[name].attrs[
                'Cate_Description'] = 'Mean aggregated over {} at each point in time.'.format(
                dims)
            if calculate_std:
                std_name = name + std_suffix
                # The std is computed from the input dataset, because
                # retset[name] already holds the reduced mean at this point.
                retset[std_name] = ds[name].std(dim=dims)
                retset[std_name].attrs[
                    'Cate_Description'] = 'Accompanying std values for variable \'{}\''.format(
                    name)

    return retset
Esempio n. 9
def _generic_index_calculation(ds: xr.Dataset,
                               var: VarName.TYPE,
                               region: PolygonLike.TYPE,
                               window: int,
                               file: str,
                               name: str,
                               threshold: float = None,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation. Where an index is defined as an anomaly
    against the given reference of a moving average of the given window size of
    the given region of the given variable of the given dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param name: Name of the index
    :param threshold: Absolute threshold that indicates an ENSO event
    :param monitor: a progress monitor.
    :return: A data frame that contains the index timeseries in a column
             called *name*; if *threshold* is given it also contains the
             boolean columns 'El Nino' and 'La Nina'
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values, columns=[name], index=ts.time)
        # Centered moving average; dropna() removes the rows for which the
        # window yields no value.
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    # Flag the time steps at which the index exceeds +threshold / falls below
    # -threshold. The series are indexed like retval itself.
    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
Esempio n. 10
def long_term_average(source: str,
                      year_min: int,
                      year_max: int,
                      file: str,
                      var: VarNamesLike.TYPE = None,
                      save: bool = False,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform the long term monthly average of the given data source for the
    given range of years. Only data sources whose 'time_frequency' is 'mon'
    are accepted.

    Depending on the given year range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    :param source: The data source from which to extract the monthly average
    :param year_min: The earliest year of the desired time range
    :param year_max: The most recent year of the desired time range
    :param file: filepath where to save the long term average dataset
    :param var: If given, only these variable names will be preserved in the
           resulting dataset
    :param save: If True, saves the data downloaded during this operation. This
           can potentially be a very large amount of data.
    :param monitor: A progress monitor to use
    :return: The Long Term Average dataset.
    :raises ValueError: if *source* matches no or more than one data source,
            or if the data source is not a monthly one
    :raises TypeError: if a data array cannot be divided
    """
    var = VarNamesLike.convert(var)

    n_years = year_max - year_min + 1
    res = 0
    total_work = 100

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    # NOTE(review): the backslash continues the string literal itself, so the
    # leading whitespace of the continuation line ends up in the message.
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query\
                         term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query\
                         term {}".format(data_sources, source))

    data_source = data_sources[0]
    source_info = data_source.cache_info

    # Check if we have a monthly data source
    fq = data_source.meta_info['time_frequency']
    if fq != 'mon':
        raise ValueError("Only monthly datasets are supported for time being.")

    with monitor.starting('LTA', total_work=total_work):
        # Set up the monitor
        # 90% of the total work is spread evenly over the years; the
        # remaining 10% is reported after saving the result.
        step = total_work * 0.9 / n_years

        # Process the data source year by year
        # NOTE(review): with year_min > year_max the '!=' condition below is
        # never false - confirm callers validate the range.
        year = year_min
        while year != year_max + 1:

            tmin = "{}-01-01".format(year)
            tmax = "{}-12-31".format(year)

            # Determine if the data for the given year are already downloaded
            # If at least one file of the given time range is present, we
            # don't delete the data for this year, we do the syncing anyway.
            was_already_downloaded = False
            dt_range = to_datetime_range(tmin, tmax)
            for date in source_info:
                if dt_range[0] <= date <= dt_range[1]:
                    was_already_downloaded = True
                    # One is enough
                    # NOTE(review): a loop exit (presumably 'break') appears
                    # to be missing after this comment.

            # If the sync did not advance the monitor, report its share of
            # the work here instead.
            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            # NOTE(review): the 'try:' matching 'except TypeError' below
            # appears to be missing here.
                # NOTE(review): 'res' starts as the int 0; once it holds a
                # dataset this comparison is no longer a plain scalar test -
                # verify that the accumulation branch is actually reached.
                if res == 0:
                    res = ds / n_years
                    # NOTE(review): an 'else:' appears to be missing before
                    # the accumulation below.
                    # Xarray doesn't do automatic alignment for in place
                    # operations, hence we have to do it manually
                    res = res + ds.reindex_like(res) / n_years
            except TypeError:
                raise TypeError('One or more data arrays feature a dtype that\
                                can not be divided. Consider using the var\
                                parameter to filter the dataset.')

            # delete data for the current year, if it should be deleted and it
            # was not already downloaded.
            if (not save) and (not was_already_downloaded):
                # NOTE(review): the statement that deletes the data for this
                # year is missing from this 'if' body.

            monitor.progress(work=step * 0.1)

            year = year + 1

        monitor.progress(msg='Saving the LTA dataset')
        save_dataset(res, file)
        monitor.progress(total_work * 0.1)

    return res
Esempio n. 11
def temporal_agg(source: str,
                 start_date: str = None,
                 end_date: str = None,
                 var: VarNamesLike.TYPE = None,
                 level: str = 'mon',
                 method: str = 'mean',
                 save_data: bool = False,
                 monitor: Monitor = Monitor.NONE) -> (xr.Dataset, str):
    """
    Perform temporal aggregation of the given data source to the given level
    using the given method for the given time range. Only full time periods
    of the given time range will be aggregated.

    NOTE: the operation is not finished. It currently always raises a
    ValueError; the code after the ``raise`` is an unreachable draft.

    Depending on the given time range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    The aggregation result is saved into the local data store for later reuse.

    :param source: Data source to aggregate
    :param start_date: Start date of aggregation. If not given, data source
           start date is used instead
    :param end_date: End date of aggregation. If not given, data source end
           date is used instead
    :param var: If given, only these dataset variables will be preserved in the
           result
    :param level: Aggregation level
    :param method: Aggregation method
    :param save_data: Whether to save data downloaded during this operation.
           This can potentially be a lot of data.
    :param monitor: A progress monitor to use
    :return: The local data source identifier for the aggregated data
    :raises ValueError: always, because the operation is not implemented
    """
    # Raise not implemented, while not finished
    raise ValueError("Operation is not implemented.")

    # ------------------------------------------------------------------
    # Unreachable work-in-progress draft below.
    # ------------------------------------------------------------------
    var = VarNamesLike.convert(var)

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query "
                         "term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query "
                         "term {}".format(data_sources, source))

    data_source = data_sources[0]
    source_info = data_source.cache_info

    # We have to do this to have temporal coverage info in meta_info
    # NOTE(review): the statement this comment refers to is missing from the
    # draft.

    # Check if the data source temporal resolution is known
    known_res = ('day', '8-days', 'mon', 'yr')

    fq = data_source.meta_info['time_frequency']
    if (not fq) or (fq not in known_res):
        raise ValueError("The given data source features unknown time "
                         "resolution: {}".format(fq))

    # Check if the operation supports the desired aggregation step
    valid_steps = list()
    valid_steps.append(('day', 'mon'))
    if (fq, level) not in valid_steps:
        raise ValueError("Currently the operation does not support aggregation"
                         " from {} to {}".format(fq, level))

    # Determine start and end dates
    if not start_date:
        start_date = data_source.meta_info['temporal_coverage_start']
    start_date = to_datetime(start_date)
    # If start_date is not start of the month, move it to the 1st of next
    # month
    if start_date.day != 1:
        try:
            start_date = datetime(start_date.year, start_date.month + 1, 1)
        except ValueError:
            # We have tried to set the month to 13
            start_date = datetime(start_date.year + 1, 1, 1)

    if not end_date:
        end_date = data_source.meta_info['temporal_coverage_end']
    end_date = to_datetime(end_date)
    # If end date is not end of the month, move it to the last day of the
    # previous month
    if not _is_end_of_month(end_date):
        try:
            # Any valid day of the previous month will do here; the date is
            # passed through _end_of_month right below.
            end_date = datetime(end_date.year, end_date.month - 1, 27)
        except ValueError:
            # We have tried to set the month to 0
            end_date = datetime(end_date.year - 1, 12, 31)

    end_date = _end_of_month(end_date.year, end_date.month)

    # Determine the count of processing periods
    n_periods = (end_date.year - start_date.year + 1) * 12\
        + end_date.month - start_date.month - 11
    # 2000-4-1, 2000-6-30 -> 12 + 2 -11 = 3

    if n_periods < 1:
        raise ValueError("The given time range does not contain any full "
                         "calendar months to do aggregation with.")

    # Set up the monitor
    total_work = 100
    with monitor.starting('Aggregate', total_work=total_work):
        step = total_work * 0.9 / n_periods

        # Process the data source period by period
        tmin = start_date
        while tmin < end_date:
            tmax = _end_of_month(tmin.year, tmin.month)

            # Determine if the data for the given period are already downloaded
            # If at least one file of the given time range is present, we
            # don't delete the data for this period, we do the syncing anyway
            was_already_downloaded = False
            dt_range = to_datetime_range(tmin, tmax)
            for date in source_info:
                if dt_range[0] <= date <= dt_range[1]:
                    was_already_downloaded = True
                    # One is enough
                    break

            # If the sync did not advance the monitor, report its share of
            # the work here instead.
            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            # Do the aggregation

            # Save the dataset for this period into local data store

            # Close and delete the files if needed
            # delete data for the current period,if it should be deleted and it
            # was not already downloaded.
            if (not save_data) and (not was_already_downloaded):
                # NOTE(review): the clean-up statement is missing from the
                # draft; placeholder so that the module parses.
                pass

            monitor.progress(work=step * 0.1)

            # tmin for next iteration
            try:
                tmin = datetime(tmin.year, tmin.month + 1, 1)
            except ValueError:
                # Couldn't add a month -> end of year
                tmin = datetime(tmin.year + 1, 1, 1)

    monitor.progress(work=step * 0.1)

    # Return the local data source id
    return None
Esempio n. 12
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform long term average of the given dataset by doing a mean of monthly
    values over the time range covered by the dataset. E.g. it averages all
    January values, all February values, etc, to create a dataset with twelve
    time slices each containing a mean of respective monthly values.

    For further information on climatological datasets, see the CF Conventions'
    section on climatological statistics.

    :param ds: A monthly dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    :raises ValueError: if the time coordinate is not of type datetime64[ns],
            if the dataset is not a monthly ('P1M') dataset, or if it has no
            'time_coverage_resolution' attribute
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValueError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a monthly dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1M':
            raise ValueError(
                'Long term average operation expects a monthly dataset'
                ' running temporal aggregation on this dataset'
                ' beforehand may help.')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    var = VarNamesLike.convert(var)
    # Shallow copy, optionally narrowed down to the requested variables
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    # First and last time stamp of the input; used for the climatology bounds.
    time_min = pd.Timestamp(ds.time.values[0])
    time_max = pd.Timestamp(ds.time.values[-1])

    total_work = 100

    with monitor.starting('LTA', total_work=total_work):
        # One work step per calendar month group.
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month',
                                squeeze=False).apply(_mean, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    # Twelve month-start time stamps in the first year covered by the input.
    # NOTE(review): assumes every calendar month occurs in the input.
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS',
                                   periods=12)

    # Every time slice gets the same bounds: [first, last] input time stamp.
    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    # Append to an existing 'cell_methods' attribute, or create it.
    # (Loop variable deliberately not called 'var' to avoid shadowing the
    # parameter.)
    for var_name in retset.data_vars:
        try:
            retset[var_name].attrs['cell_methods'] = \
                    retset[var_name].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var_name].attrs['cell_methods'] = 'time: mean over years'

    return retset