Example #1
def no_op(num_steps: int = 20,
          step_duration: float = 0.5,
          fail_before: bool = False,
          fail_after: bool = False,
          error_type: str = 'Value',
          monitor: Monitor = Monitor.NONE) -> bool:
    """
    An operation that does nothing except spend a configurable amount of time.
    It may be useful for testing purposes.

    :param num_steps: Number of steps to iterate.
    :param step_duration: How much time to spend in each step in seconds.
    :param fail_before: Whether the operation should fail before spending time doing nothing (raises the error selected by *error_type*).
    :param fail_after: Whether the operation should fail after spending time doing nothing (raises the error selected by *error_type*).
    :param error_type: The type of error to raise.
    :param monitor: A progress monitor.
    :return: Always True
    """
    import time
    with monitor.starting('Computing nothing', num_steps):
        if fail_before:
            error_class = _ERROR_TYPES[error_type]
            raise error_class(
                f'This is a test: intentionally failed with a {error_type} error'
                f' before doing anything {num_steps} times.')
        for i in range(num_steps):
            time.sleep(step_duration)
            monitor.progress(
                1.0, 'Step %s of %s doing nothing' % (i + 1, num_steps))
        if fail_after:
            error_class = _ERROR_TYPES[error_type]
            raise error_class(
                f'Intentionally failed with a {error_type} error'
                f' after doing nothing {num_steps} times.')
    return True
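
A minimal usage sketch for the operation above, assuming no_op, Monitor and the module-level _ERROR_TYPES mapping are importable from the surrounding project; the argument values are illustrative only:

# Spend about one second doing nothing, reporting to the default no-op monitor.
no_op(num_steps=2, step_duration=0.5)

# Exercise the deliberate failure path; the raised class is looked up in _ERROR_TYPES.
try:
    no_op(num_steps=1, step_duration=0.0, fail_before=True, error_type='Value')
except Exception as error:
    print(type(error).__name__, ':', error)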
Example #2
def no_op(num_steps: int = 20,
          step_duration: float = 0.5,
          fail_before: bool = False,
          fail_after: bool = False,
          error_type: str = 'Value',
          monitor: Monitor = Monitor.NONE) -> bool:
    """
    An operation that does nothing except spend a configurable amount of time.
    It may be useful for testing purposes.

    :param num_steps: Number of steps to iterate.
    :param step_duration: How much time to spend in each step in seconds.
    :param fail_before: Whether the operation should fail before spending time doing nothing (raises the error selected by *error_type*).
    :param fail_after: Whether the operation should fail after spending time doing nothing (raises the error selected by *error_type*).
    :param error_type: The type of error to raise.
    :param monitor: A progress monitor.
    :return: Always True
    """
    import time
    with monitor.starting('Computing nothing', num_steps):
        if fail_before:
            error_class = _ERROR_TYPES[error_type]
            raise error_class(f'This is a test: intentionally failed with a {error_type} error'
                              f' before doing anything {num_steps} times.')
        for i in range(num_steps):
            time.sleep(step_duration)
            monitor.progress(1.0, 'Step %s of %s doing nothing' % (i + 1, num_steps))
        if fail_after:
            error_class = _ERROR_TYPES[error_type]
            raise error_class(f'Intentionally failed with a {error_type} error'
                              f' after doing nothing {num_steps} times.')
    return True
Example #3
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item == 'log':
                    retset = np.log(retset)
                elif item == 'log10':
                    retset = np.log10(retset)
                elif item == 'log2':
                    retset = np.log2(retset)
                elif item == 'log1p':
                    retset = np.log1p(retset)
                elif item == 'exp':
                    retset = np.exp(retset)
                else:
                    # Report the full token, not just its first character.
                    raise ValidationError('Arithmetic operation {} not'
                                          ' implemented.'.format(item))

    return retset
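
A usage sketch for the comma-separated operation syntax, assuming ds_arithmetics and its Monitor default are in scope and that DatasetLike.convert passes a plain xarray dataset through unchanged; the data values are made up:

import numpy as np
import xarray as xr

ds = xr.Dataset({'var': (['x'], np.array([1.0, np.e, 10.0]))})
# Operations are applied in order: natural log first, then add 5, then divide by 3.
result = ds_arithmetics(ds, 'log,+5,/3')
print(result['var'].values)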
Example #4
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item == 'log':
                    retset = xu.log(retset)
                elif item == 'log10':
                    retset = xu.log10(retset)
                elif item == 'log2':
                    retset = xu.log2(retset)
                elif item == 'log1p':
                    retset = xu.log1p(retset)
                elif item == 'exp':
                    retset = xu.exp(retset)
                else:
                    # Report the full token, not just its first character.
                    raise ValueError('Arithmetic operation {} not'
                                     ' implemented.'.format(item))

    return retset
Example #5
def anomaly_external(ds: xr.Dataset,
                     file: str,
                     transform: str = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly with external reference data, for example, a climatology.
    The given reference dataset is expected to consist of 12 time slices, one
    for each month.

    The returned dataset will contain the variables found in both the
    reference and the given dataset. Variables found in the given dataset but not
    in the reference will be dropped from the resulting dataset. The anomaly is
    calculated against the corresponding month of the reference data,
    e.g. January against January.

    In case spatial extents differ between the reference and the given dataset,
    the anomaly will be calculated on the intersection.

    :param ds: The dataset to calculate anomalies from
    :param file: Path to reference data file
    :param transform: Apply the given transformation before calculating the anomaly.
                      For supported operations see help on 'ds_arithmetics' operation.
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    """
    # Check if the time coordinate is of dtype datetime
    try:
        if ds.time.dtype != 'datetime64[ns]':
            raise ValidationError(
                'The dataset provided for anomaly calculation'
                ' is required to have a time coordinate of'
                ' dtype datetime64[ns]. Running the normalize'
                ' operation on this dataset might help.')
    except AttributeError:
        raise ValidationError('The dataset provided for anomaly calculation'
                              ' is required to have a time coordinate.')

    clim = xr.open_dataset(file)
    ret = ds.copy()
    if transform:
        ret = ds_arithmetics(ds, transform)
    # Group by months, subtract the appropriate slice from the reference
    # Note that this requires that 'time' coordinate labels are of type
    # datetime64[ns]
    total_work = 100
    step = 100 / 12

    with monitor.starting('Anomaly', total_work=total_work):
        monitor.progress(work=0)
        kwargs = {'ref': clim, 'monitor': monitor, 'step': step}
        ret = ret.groupby(ds['time.month']).apply(_group_anomaly, **kwargs)

    # Running groupby results in a redundant 'month' variable being added to
    # the dataset
    ret = ret.drop('month')
    return ret
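
A sketch of the expected inputs, assuming anomaly_external is in scope; the reference file must contain exactly twelve monthly time slices carrying the same variable names as the input dataset (file name, variable name and random values are illustrative):

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=24, freq='MS')
ds = xr.Dataset({'tas': (['time'], np.random.rand(24))}, coords={'time': time})

# A twelve-slice reference ("climatology"), one entry per month.
clim_time = pd.date_range('1991-01-01', periods=12, freq='MS')
clim = xr.Dataset({'tas': (['time'], np.random.rand(12))}, coords={'time': clim_time})
clim.to_netcdf('climatology.nc')

anomalies = anomaly_external(ds, 'climatology.nc')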
Example #6
def _lta_daily(ds: xr.Dataset, monitor: Monitor):
    """
    Carry out a long-term average of a daily dataset

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0])
    time_max = pd.Timestamp(ds.time.values[-1])
    total_work = 100
    retset = ds

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 366
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month',
                                squeeze=False).apply(_groupby_day, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.stack(time=('month', 'day'))

    # Get rid of redundant dates
    drop = [(2, 29), (2, 30), (2, 31), (4, 31), (6, 31), (9, 31), (11, 31)]
    retset = retset.drop(drop, dim='time')

    # Turn month, day coordinates to time
    retset = retset.reset_index('time')
    retset = retset.drop(['month', 'day'])
    time_coord = pd.date_range(start='{}-01-01'.format(time_min.year),
                               end='{}-12-31'.format(time_min.year),
                               freq='D')
    if len(time_coord) == 366:
        time_coord = time_coord.drop(
            np.datetime64('{}-02-29'.format(time_min.year)))
    retset['time'] = time_coord

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (365, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
Example #7
def _mean(ds: xr.Dataset, monitor: Monitor, step: float):
    """
    Calculate mean of the given dataset and update the given monitor.

    :param ds: Dataset to take the mean of
    :param monitor: Monitor to update
    :param step: Work step
    """
    retset = ds.mean(dim='time', keep_attrs=True)
    monitor.progress(work=step)
    return retset
Example #9
def _lta_daily(ds: xr.Dataset, monitor: Monitor):
    """
    Carry out a long-term average of a daily dataset

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 366
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_groupby_day, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.stack(time=('month', 'day'))

    # Get rid of redundant dates
    drop = [(2, 29), (2, 30), (2, 31), (4, 31), (6, 31),
            (9, 31), (11, 31)]
    retset = retset.drop(drop, dim='time')

    # Turn month, day coordinates to time
    retset = retset.reset_index('time')
    retset = retset.drop(['month', 'day'])
    time_coord = pd.date_range(start='{}-01-01'.format(time_min.year),
                               end='{}-12-31'.format(time_min.year),
                               freq='D')
    if len(time_coord) == 366:
        time_coord = time_coord.drop(np.datetime64('{}-02-29'.format(time_min.year)))
    retset['time'] = time_coord

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (365, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
Example #10
    def update_indices(self,
                       update_file_lists: bool = False,
                       monitor: Monitor = Monitor.NONE):
        with monitor.starting('Updating indices', 100):
            self._init_data_sources()
            monitor.progress(work=10 if update_file_lists else 100)
            if update_file_lists:
                child_monitor = monitor.child(work=90)
                with child_monitor.starting('Updating file lists',
                                            len(self._data_sources)):
                    for data_source in self._data_sources:
                        data_source.update_file_list()
                        child_monitor.progress(work=1)
Example #11
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. If no variables
    have been given explicitly, it can be set that only variables featuring numeric
    values should be reduced.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: Reduction method, one of 'min', 'max', 'mean', 'median', 'sum'
    :param monitor: A progress monitor
    """
    ufuncs = {
        'min': np.nanmin,
        'max': np.nanmax,
        'mean': np.nanmean,
        'median': np.nanmedian,
        'sum': np.nansum
    }

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [
            value for value in dim if value in retset[var_name].dims
        ]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
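
A usage sketch, assuming the reduce operation above is in scope and that the VarNamesLike/DimNamesLike converters accept plain names; the toy data is made up:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=10, freq='D')
ds = xr.Dataset({'tas': (['time', 'lat'], np.random.rand(10, 3))},
                coords={'time': time, 'lat': [0.0, 1.0, 2.0]})

# Reduce 'tas' along 'time' only, ignoring NaNs, using the median.
reduced = reduce(ds, var='tas', dim='time', method='median')
print(reduced['tas'].dims)  # ('lat',)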
Example #12
    def _sync_files(self, ftp, ftp_base_dir, expected_remote_files, num_of_expected_remote_files,
                    monitor: Monitor) -> int:
        sync_files_number = 0
        checked_files_number = 0

        files_to_download = OrderedDict()
        file_set_size = 0
        for expected_dir_path, expected_filename_dict in expected_remote_files.items():
            if monitor.is_cancelled():
                raise Cancellation()
            ftp_dir = ftp_base_dir + '/' + expected_dir_path
            try:
                ftp.cwd(ftp_dir)
            except ftplib.Error:
                # Note: If we can't CWD to ftp_dir, this usually means that
                # expected_dir_path refers to a time range that is not covered remotely.
                monitor.progress(work=1)
                continue

            try:
                remote_dir_content = ftp.mlsd(facts=['type', 'size', 'modify'])
            except ftplib.Error:
                # Note: If we can't MLSD the CWD ftp_dir, we have a problem.
                monitor.progress(work=1)
                continue

            for existing_filename, facts in remote_dir_content:
                if monitor.is_cancelled():
                    raise Cancellation()
                if facts.get('type', None) == 'file' and existing_filename in expected_filename_dict:
                    # update expected_filename_dict with facts of existing_filename
                    expected_filename_dict[existing_filename] = facts
                    file_size = int(facts.get('size', '-1'))
                    if file_size > 0:
                        file_set_size += file_size
                    # TODO (forman, 20160619): put also 'modify' in file_info, to update outdated local files
                    existing_file_info = dict(size=file_size, path=expected_dir_path)
                    files_to_download[existing_filename] = existing_file_info

        last_cwd = None
        if files_to_download:
            dl_stat = _DownloadStatistics(file_set_size)
            for existing_filename, existing_file_info in files_to_download.items():
                checked_files_number += 1
                child_monitor = monitor.child(work=1.)
                if monitor.is_cancelled():
                    raise Cancellation()
                if last_cwd != existing_file_info['path']:
                    ftp.cwd(ftp_base_dir + '/' + existing_file_info['path'])
                    last_cwd = existing_file_info['path']
                downloader = FtpDownloader(ftp,
                                           existing_filename, existing_file_info, self._file_set_data_store.root_dir,
                                           (checked_files_number, num_of_expected_remote_files), child_monitor,
                                           dl_stat)
                result = downloader.start()
                if DownloadStatus.SUCCESS is result:
                    sync_files_number += 1
        return sync_files_number
Example #13
def _fetch_solr_json(base_url,
                     query_args,
                     offset=0,
                     limit=3500,
                     timeout=10,
                     monitor: Monitor = Monitor.NONE):
    """
    Return JSON value read from paginated Solr web-service.
    """
    combined_json_dict = None
    num_found = -1
    # we don't know ahead of time how many requests are necessary
    with monitor.starting("Loading", 10):
        while True:
            monitor.progress(work=1)
            paging_query_args = dict(query_args or {})
            # noinspection PyArgumentList
            paging_query_args.update(offset=offset,
                                     limit=limit,
                                     format='application/solr+json')
            url = base_url + '?' + urllib.parse.urlencode(paging_query_args)
            try:
                with urllib.request.urlopen(url, timeout=timeout) as response:
                    json_text = response.read()
                    json_dict = json.loads(json_text.decode('utf-8'))
                    if num_found == -1:
                        num_found = json_dict.get('response',
                                                  {}).get('numFound', 0)
                    if not combined_json_dict:
                        combined_json_dict = json_dict
                        if num_found < limit:
                            break
                    else:
                        docs = json_dict.get('response', {}).get('docs', [])
                        combined_json_dict.get('response',
                                               {}).get('docs', []).extend(docs)
                        if num_found < offset + limit:
                            break
            except (urllib.error.HTTPError, urllib.error.URLError) as e:
                raise DataAccessError(
                    "Downloading CCI Open Data Portal index failed: {}\n{}".
                    format(e, base_url)) from e
            except socket.timeout:
                raise DataAccessError(
                    "Downloading CCI Open Data Portal index failed: connection timeout\n{}"
                    .format(base_url))
            offset += limit
    return combined_json_dict
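
A call sketch only; the endpoint URL and query keys below are hypothetical placeholders, not the real CCI Open Data Portal parameters:

# The function pages through results by increasing 'offset' until 'numFound'
# is exhausted, merging the 'docs' lists of the individual responses.
json_dict = _fetch_solr_json('https://index.example.org/solr/search',
                             {'query': 'esacci'},
                             offset=0, limit=1000, timeout=30)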
Example #15
def anomaly_internal(ds: xr.Dataset,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly using as reference data the mean of an optional region
    and time slice from the given dataset. If no time slice/spatial region is
    given, the operation will calculate anomaly using the mean of the whole
    dataset as the reference.

    This is done for each data array in the dataset.
    :param ds: The dataset to calculate anomalies from
    :param time_range: Time range to use for reference data
    :param region: Spatial region to use for reference data
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    """
    ref = ds.copy()
    if time_range:
        time_range = TimeRangeLike.convert(time_range)
        ref = subset_temporal(ref, time_range)
    if region:
        region = PolygonLike.convert(region)
        ref = subset_spatial(ref, region)
    with monitor.observing("Calculating anomaly"):
        ref = ref.mean(keep_attrs=True, skipna=True)
        diff = ds - ref
    return diff
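
A minimal usage sketch, assuming anomaly_internal is in scope; with neither time_range nor region given, the mean of the whole dataset serves as the reference:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=12, freq='MS')
ds = xr.Dataset({'tas': (['time'], 10.0 + np.random.rand(12))}, coords={'time': time})

# Anomaly of each data array relative to its mean over the whole dataset.
anomalies = anomaly_internal(ds)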
Example #16
    def add_local_data_source(self, data_source_id: str, file_path_pattern: str, monitor: Monitor):
        """
        Adds a local data source made up of the specified files.

        :param data_source_id: The identifier of the local data source.
        :param file_path_pattern: The file path pattern, which may contain wildcards.
        :param monitor: a progress monitor.
        :return: JSON-serializable list of 'local' data sources, sorted by name.
        """
        data_store = DATA_STORE_REGISTRY.get_data_store('local')
        if data_store is None:
            raise ValueError('Unknown data store: "%s"' % 'local')
        with monitor.starting('Adding local data source', 100):
            # TODO use monitor, while extracting metadata
            data_store.add_pattern(data_source_id=data_source_id, files=file_path_pattern)
            return self.get_data_sources('local', monitor=monitor.child(100))
Example #17
def compute(ds: DatasetLike.TYPE,
            expr: str,
            var: VarName.TYPE,
            copy: bool = False,
            _ctx: dict = None,
            monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """

    :param ds: The primary dataset. If omitted, all variables need to be prefixed by their dataset resource names.
    :param expr: Math expression in which all *ds* variables may be used by name.
    :param var: The new variable's name.
    :param copy: Whether to copy all variables from *ds*.
    :param _ctx: An internal context; if it provides a 'value_cache' dictionary, its entries are added to the expression's namespace.
    :param monitor: An optional progress monitor.
    :return: A new dataset with the new variable.
    """

    if _ctx is not None and 'value_cache' in _ctx:
        local_namespace = dict(_ctx['value_cache'])
    else:
        local_namespace = dict()

    if ds is not None:
        local_namespace.update(ds.data_vars)

    with monitor.observing("Computing variable"):
        data_array = safe_eval(expr, local_namespace=local_namespace)
        data_array.name = var

    if ds is not None and copy:
        new_ds = ds.copy()
        new_ds[var] = data_array
    else:
        new_ds = xr.Dataset(data_vars={var: data_array})
    return new_ds
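
A usage sketch, assuming compute and the safe_eval helper it relies on are in scope; the variable names and expression are made up:

import numpy as np
import xarray as xr

ds = xr.Dataset({'a': (['x'], np.arange(5.0)),
                 'b': (['x'], np.ones(5))})

# Evaluate the expression against the dataset's variables and add the result as 'c'.
new_ds = compute(ds, expr='a * 2 + b', var='c', copy=True)
print(new_ds['c'].values)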
Example #19
def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
                               ds_y: DatasetLike.TYPE,
                               var_x: VarName.TYPE,
                               var_y: VarName.TYPE,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two data variables and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if (array_x.dims != array_y.dims):
        raise ValidationError('Both datasets should feature the same'
                              ' dimensionality. Currently provided ds_x[var_x] '
                              f'has {array_x.dims}, provided ds_y[var_y]'
                              f' has {array_y.dims}')

    for dim in array_x.dims:
        if len(array_x[dim]) != len(array_y[dim]):
            raise ValidationError('All dimensions of both provided data variables'
                                  f' must be the same length. Currently {dim} of ds_x[var_x]'
                                  f' has {len(array_x[dim])} values, while'
                                  f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.'
                                  ' You may want to try to coregister the datasets beforehand.')

    n_vals = 1
    for dim in array_x.dims:
        n_vals = n_vals * len(array_x[dim])

    if n_vals < 3:
        raise ValidationError('There should be no less than 3 values in both data variables'
                              f' to perform the correlation. Currently there are {n_vals} values')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.stack(z=array_x.dims), array_y.stack(z=array_y.dims))

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
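
A usage sketch, assuming pearson_correlation_scalar is in scope and that scipy's pearsonr is imported at module level as the code above requires; the random data is illustrative:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=24, freq='MS')
ds = xr.Dataset({'x': (['time'], np.random.rand(24)),
                 'y': (['time'], np.random.rand(24))},
                coords={'time': time})

frame = pearson_correlation_scalar(ds, ds, 'x', 'y')
print(frame[['corr_coef', 'p_value']])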
Example #20
    def getrecords(self, monitor: Monitor = Monitor.NONE):
        if not self._catalogue_service:
            self._init_service()

        if not self._catalogue:
            self._build_catalogue(monitor.child(1))

        return self._catalogue
Example #21
File: index.py Project: whigg/cate
def _generic_index_calculation(
        ds: xr.Dataset,
        var: VarName.TYPE,
        region: PolygonLike.TYPE,
        window: int,
        file: str,
        name: str,
        threshold: float = None,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation, where the index is defined as the anomaly,
    relative to the given reference file, of a moving average (with the given
    window size) of the given variable over the given region of the given dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    :param monitor: a progress monitor.
    :return: A data frame that contains the index time series
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values,
                          columns=[name],
                          index=ts.time.values)
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
Example #22
def _do_json_rpc(web_socket, rpc_request: dict, monitor: Monitor) -> dict:
    web_socket.write_message(json.dumps(rpc_request))
    work_reported = None
    started = False
    while monitor is None or not monitor.is_cancelled():
        response_str = yield web_socket.read_message()
        rpc_response = json.loads(response_str)
        if 'progress' in rpc_response:
            if monitor:
                progress = rpc_response['progress']
                total = progress.get('total')
                label = progress.get('label')
                worked = progress.get('worked')
                msg = progress.get('message')

                if not started:
                    monitor.start(label or "start", total_work=total)
                    started = True

                if started:
                    if worked:
                        if work_reported is None:
                            work_reported = 0.0
                        work = worked - work_reported
                        work_reported = worked
                    else:
                        work = None
                    monitor.progress(work=work, msg=msg)
        else:
            if monitor and started:
                monitor.done()
            return rpc_response

    return {}
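
A sketch of the progress message shape the coroutine above reacts to, reconstructed from the keys it reads ('total', 'label', 'worked', 'message'); the concrete values are illustrative:

progress_message = {
    'progress': {
        'label': 'Loading data sources',  # used once to start the monitor
        'total': 100,                      # passed to monitor.start() as total_work
        'worked': 42,                      # cumulative work; deltas go to monitor.progress()
        'message': 'Step 42 of 100',
    }
}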
Example #24
def _group_anomaly(group: xr.Dataset,
                   ref: xr.Dataset,
                   monitor: Monitor = Monitor.NONE,
                   step: float = None):
    """
    Calculate anomaly for the given group.

    :param group: Result of a groupby('time.month') operation
    :param ref: Reference dataset
    :param monitor: Monitor of the parent method
    :param step: Step to add to monitor progress
    :return: Group dataset with anomaly calculation applied
    """
    # Retrieve the month of the current group
    month = group['time.month'][0].values
    ret = diff(group, ref.isel(time=month - 1))
    monitor.progress(work=step)
    return ret
Example #26
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. If no variables
    have been given explicitly, it can be set that only variables featuring numeric
    values should be reduced.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: Reduction method, one of 'min', 'max', 'mean', 'median', 'sum'
    :param monitor: A progress monitor
    """
    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [value for value in dim if value in retset[var_name].dims]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
Example #27
def tseries_mean(ds: xr.Dataset,
                 var: VarNamesLike.TYPE,
                 std_suffix: str = '_std',
                 calculate_std: bool = True,
                 monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Extract a spatial mean time series of the provided variables and return a
    dataset that, in addition to all the information in the given dataset,
    also contains the time series data for the provided variables. If
    requested, the accompanying standard deviation is stored in a variable
    named '<var_name>' + *std_suffix*, e.g. 'var_name1_std'.

    If a data variable with more dimensions than time/lat/lon is provided,
    the data will be reduced by taking the mean of all data values at a single
    time position, resulting in a one-dimensional time series data variable.

    :param ds: The dataset from which to perform timeseries extraction.
    :param var: Variables for which to perform timeseries extraction
    :param calculate_std: Whether to calculate std in addition to mean
    :param std_suffix: Std suffix to use for resulting datasets, if std is calculated.
    :param monitor: a progress monitor.
    :return: Dataset with timeseries variables
    """
    if not var:
        var = '*'

    retset = select_var(ds, var)
    names = retset.data_vars.keys()

    with monitor.starting("Calculate mean", total_work=len(names)):
        for name in names:
            dims = list(ds[name].dims)
            dims.remove('time')
            with monitor.child(1).observing("Calculate mean"):
                retset[name] = retset[name].mean(dim=dims, keep_attrs=True)
            retset[name].attrs['Cate_Description'] = \
                'Mean aggregated over {} at each point in time.'.format(dims)
            if calculate_std:
                std_name = name + std_suffix
                retset[std_name] = ds[name].std(dim=dims)
                retset[std_name].attrs['Cate_Description'] = \
                    'Accompanying std values for variable \'{}\''.format(name)

    return retset
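
A usage sketch, assuming tseries_mean and the select_var operation it relies on are in scope; the toy grid is made up:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=4, freq='MS')
ds = xr.Dataset({'tas': (['time', 'lat', 'lon'], np.random.rand(4, 2, 3))},
                coords={'time': time, 'lat': [0.0, 1.0], 'lon': [10.0, 11.0, 12.0]})

ts = tseries_mean(ds, var='tas')
# 'tas' is now a 1-D time series; 'tas_std' holds the accompanying std values.
print(ts['tas'].dims, ts['tas_std'].dims)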
Example #28
def _generic_index_calculation(ds: xr.Dataset,
                               var: VarName.TYPE,
                               region: PolygonLike.TYPE,
                               window: int,
                               file: str,
                               name: str,
                               threshold: float = None,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation, where the index is defined as the anomaly,
    relative to the given reference file, of a moving average (with the given
    window size) of the given variable over the given region of the given dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    :param monitor: a progress monitor.
    :return: A data frame that contains the index time series
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values, columns=[name], index=ts.time)
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
Example #29
def diff(ds: xr.Dataset,
         ds2: xr.Dataset,
         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate the difference of two datasets (ds - ds2). This is done by
    matching variable names in the two datasets against each other and taking
    the difference of matching variables.

    If lat/lon/time extents differ between the datasets, the default behavior
    is to take the intersection of the datasets and run subtraction on that.
    However, broadcasting is possible. E.g. ds(lat/lon/time) - ds(lat/lon) is
    valid. In this case the subtrahend will be stretched to the size of
    ds(lat/lon/time) so that it can be subtracted. This also works if the
    subtrahend is a single time slice of arbitrary temporal position. In this
    case, the time dimension will be squeezed out leaving a lat/lon dataset.

    :param ds: The minuend dataset
    :param ds2: The subtrahend dataset
    :param monitor: a progress monitor.
    :return: The difference dataset
    """
    try:
        # Times do not intersect
        if 0 == len(ds.time - ds2.time) and \
                len(ds.time) == len(ds2.time):  # Times are the same length
            # If the datasets don't intersect in time dimension, a naive difference
            # would return empty data variables. Hence, the time coordinate has to
            # be dropped beforehand
            ds = ds.drop('time')
            ds2 = ds2.drop('time')
            return ds - ds2
    except AttributeError:
        # It is likely that the one operand is a lat/lon array that can be
        # broadcast against the other operand
        pass

    try:
        if 1 == len(ds2.time):
            # The subtrahend is a single time slice -> squeeze the 'time' dimension
            # so that it can be broadcast along the minuend
            ds2 = ds2.squeeze('time', drop=True)
    except AttributeError:
        # Doesn't have a time dimension already
        pass
    except TypeError as e:
        if 'unsized object' in str(e):
            # The 'time' variable is a scalar
            pass
        else:
            raise TypeError(str(e))

    with monitor.observing("Subtract datasets"):
        diff = ds - ds2

    return diff
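
A usage sketch for the broadcasting case, assuming diff is in scope; the subtrahend is a single time slice, so its 'time' dimension is squeezed out before subtraction:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=3, freq='MS')
ds = xr.Dataset({'tas': (['time', 'lat'], np.random.rand(3, 2))},
                coords={'time': time, 'lat': [0.0, 1.0]})

reference = ds.isel(time=[0])   # keep 'time' as a length-1 dimension
result = diff(ds, reference)
print(result['tas'].shape)      # (3, 2)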
Example #30
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two timeseries and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if ((len(array_x.dims) != len(array_y.dims)) and (len(array_x.dims) != 1)):
        raise ValidationError('To calculate simple correlation, both provided'
                              ' datasets should be simple 1d timeseries. To'
                              ' create a map of correlation coefficients, use'
                              ' pearson_correlation operation instead.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError(
            'The length of the time dimension differs between'
            ' the given datasets. Can not perform the calculation'
            ', please review operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError(
            'The length of the time dimension should not be less'
            ' than three to run the calculation.')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.values, array_y.values)

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
Example #32
def _lta_monthly(ds: xr.Dataset, monitor: Monitor):
    """
    Carry out a long-term average on a monthly dataset

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month',
                                squeeze=False).apply(_mean, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS',
                                   periods=12)

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
Example #33
def _resample_dataset(ds_master: xr.Dataset, ds_replica: xr.Dataset,
                      method_us: int, method_ds: int,
                      monitor: Monitor) -> xr.Dataset:
    """
    Resample replica onto the grid of the master.
    This does spatial resampling of the whole dataset, i.e., all
    variables in the replica dataset.
    This method works only if both datasets have (time, lat, lon) dimensions.

    Note that dataset attributes are not propagated because the CDM attribute set is currently undecided.

    :param ds_master: xr.Dataset whose lat/lon coordinates are used as the resampling grid
    :param ds_replica: xr.Dataset that will be resampled on the master's grid
    :param method_us: Interpolation method for upsampling, see resampling.py
    :param method_ds: Interpolation method for downsampling, see resampling.py
    :param monitor: a progress monitor.
    :return: xr.Dataset The resampled replica dataset
    """
    # Find lat/lon bounds of the intersection of master and replica grids. The
    # bounds should fall on pixel boundaries for both spatial dimensions for
    # both datasets
    lat_min, lat_max = _find_intersection(ds_master['lat'].values,
                                          ds_replica['lat'].values,
                                          global_bounds=(-90, 90))
    lon_min, lon_max = _find_intersection(ds_master['lon'].values,
                                          ds_replica['lon'].values,
                                          global_bounds=(-180, 180))

    # Subset replica dataset and master grid. We're not using here the subset
    # operation, because the subset operation may produce datasets that cross
    # the anti-meridian by design. However, such a disjoint dataset can not be
    # resampled using our current resampling methods.
    lat_slice = slice(lat_min, lat_max)
    lon_slice = slice(lon_min, lon_max)

    lon = ds_master['lon'].sel(lon=lon_slice)
    lat = ds_master['lat'].sel(lat=lat_slice)
    ds_replica = ds_replica.sel(lon=lon_slice, lat=lat_slice)

    # Don't do anything if datasets already have the same spatial definition
    if _grids_equal(ds_master, ds_replica):
        return ds_replica

    with monitor.starting("coregister dataset", len(ds_replica.data_vars)):
        kwargs = {
            'lon': lon,
            'lat': lat,
            'method_us': method_us,
            'method_ds': method_ds,
            'parent_monitor': monitor
        }
        retset = ds_replica.apply(_resample_array, keep_attrs=True, **kwargs)

    return adjust_spatial_attrs(retset)
Example #34
def _resample_array(array: xr.DataArray, lon: xr.DataArray, lat: xr.DataArray, method_us: int,
                    method_ds: int, parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample the given xr.DataArray to a new grid defined by lat and lon

    :param array: xr.DataArray with lat,lon and time coordinates
    :param lat: 'lat' xr.DataArray attribute for the new grid
    :param lon: 'lon' xr.DataArray attribute for the new grid
    :param method_us: Interpolation method to use for upsampling, see resampling.py
    :param method_ds: Interpolation method to use for downsampling, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: The resampled array
    """
    # Determine width and height of the resampled array
    width = lon.values.size
    height = lat.values.size

    monitor = parent_monitor.child(1)

    kwargs = {'w': width, 'h': height, 'ds_method': method_ds, 'us_method': method_us, 'parent_monitor': monitor}

    groupby_list = list(array.dims)
    for dim in ['lon', 'lat']:
        groupby_list.remove(dim)

    if not groupby_list:
        # a 2d dataset, can't do groupby => do a simple slice resample
        with monitor.starting("coregister dataarray", total_work=1):
            temp_array = _resample_slice(array, **kwargs)
            coords = {'lat': lat, 'lon': lon}
            return xr.DataArray(temp_array.values,
                                name=array.name,
                                dims=array.dims,
                                coords=coords,
                                attrs=array.attrs).chunk()

    num_steps = 1
    for dim in groupby_list:
        num_steps = num_steps * len(array[dim])

    with monitor.starting("coregister dataarray", total_work=num_steps):
        temp_array = _nested_groupby_apply(array, groupby_list, _resample_slice, kwargs)
        chunks = {'lat': height, 'lon': width}
        coords = {'lat': lat, 'lon': lon}
        for dim in groupby_list:
            coords[dim] = array[dim]
            # One spatial slice is one dask chunk, e.g. chunking is
            # (1,1,1..1,len(lat),len(lon))
            chunks[dim] = 1
        return xr.DataArray(temp_array.values,
                            name=array.name,
                            dims=array.dims,
                            coords=coords,
                            attrs=array.attrs).chunk(chunks=chunks)
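The _nested_groupby_apply helper used above is not shown in these examples. A rough sketch of what such a helper could look like, built only on xarray's groupby/apply API; the name and structure are an assumption, not the project's code:

def _nested_groupby_apply_sketch(array, groupby_dims, apply_fn, kwargs):
    # Assumed helper: recursively group over every non-spatial dimension and
    # apply the given function to each remaining (lat, lon) slice.
    if not groupby_dims:
        return apply_fn(array, **kwargs)
    return array.groupby(groupby_dims[0], squeeze=False).apply(
        _nested_groupby_apply_sketch,
        groupby_dims=groupby_dims[1:],
        apply_fn=apply_fn,
        kwargs=kwargs)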
Example #36
0
File: io.py Project: whigg/cate
def save_dataset(ds: xr.Dataset, file: str, format: str = None, monitor: Monitor = Monitor.NONE):
    """
    Save a dataset to NetCDF file.

    :param ds: The dataset
    :param file: File path
    :param format: NetCDF format flavour, one of 'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'.
    :param monitor: a progress monitor.
    """
    with monitor.observing("save_dataset"):
        ds.to_netcdf(file, format=format)
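A minimal usage sketch, assuming an xr.Dataset named ds is already in memory; the output file name is a placeholder:

save_dataset(ds, 'output.nc', format='NETCDF4', monitor=Monitor.NONE)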
Example #37
0
def _lta_monthly(ds: xr.Dataset, monitor: Monitor):
    """
    Carry out a long term average on a monthly dataset

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_mean, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS',
                                   periods=12)

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
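The _mean helper applied to each month group is not shown here. A rough sketch of what it might do, assuming it averages the group over time and advances the parent monitor by the pre-computed step:

def _mean_sketch(ds_month, monitor, step):
    # Assumed helper: average one calendar-month group over time and
    # report the given amount of progress on the shared monitor.
    retset = ds_month.mean(dim='time', keep_attrs=True)
    monitor.progress(work=step)
    return retset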
Example #38
0
def no_op(num_steps: int = 10,
          step_duration: float = 0.5,
          fail_before: bool = False,
          fail_after: bool = False,
          monitor: Monitor = Monitor.NONE) -> bool:
    """
    An operation that basically does nothing but spending configurable time.
    It may be useful for testing purposes.

    :param num_steps: Number of steps to iterate.
    :param step_duration: How much time to spend in each step in seconds.
    :param fail_before: If the operation should fail before spending time doing nothing.
    :param fail_after: If the operation should fail after spending time doing nothing.
    :param monitor: A progress monitor.
    :return: Always True
    """
    import time
    monitor.start('Computing nothing', num_steps)
    if fail_before:
        raise ValueError('Intentionally failed before doing anything.')
    for i in range(num_steps):
        time.sleep(step_duration)
        monitor.progress(1.0,
                         'Step %s of %s doing nothing' % (i + 1, num_steps))
    if fail_after:
        raise ValueError('Intentionally failed after doing nothing.')
    monitor.done()
    return True
Example #39
0
def _fetch_solr_json(base_url,
                     query_args,
                     offset=0,
                     limit=3500,
                     timeout=10,
                     monitor: Monitor = Monitor.NONE):
    """
    Return JSON value read from paginated Solr web-service.
    """
    combined_json_dict = None
    num_found = -1
    # we don't know ahead of time how many requests will be necessary
    with monitor.starting("Loading", 10):
        while True:
            monitor.progress(work=1)
            if monitor.is_cancelled():
                raise InterruptedError
            paging_query_args = dict(query_args or {})
            paging_query_args.update(offset=offset,
                                     limit=limit,
                                     format='application/solr+json')
            url = base_url + '?' + urllib.parse.urlencode(paging_query_args)
            with urllib.request.urlopen(url, timeout=timeout) as response:
                json_text = response.read()
                json_dict = json.loads(json_text.decode('utf-8'))
                if num_found == -1:
                    num_found = json_dict.get('response',
                                              {}).get('numFound', 0)
                if not combined_json_dict:
                    combined_json_dict = json_dict
                    if num_found < limit:
                        break
                else:
                    docs = json_dict.get('response', {}).get('docs', [])
                    combined_json_dict.get('response', {}).get('docs',
                                                               []).extend(docs)
                    if num_found < offset + limit:
                        break
            offset += limit
    return combined_json_dict
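A hypothetical invocation; the endpoint and query parameters below are placeholders, not a description of a real service:

# json_dict = _fetch_solr_json('https://example.org/solr/search',
#                              {'query': 'esacci'},
#                              offset=0, limit=1000,
#                              monitor=Monitor.NONE)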
Example #40
0
def _exec_script(script: str,
                 element_types: Tuple[type, ...],
                 operation_context: Mapping[str, Any] = None,
                 context_object: Mapping[str, Any] = None,
                 monitor: Monitor = Monitor.NONE) -> Dict[str, Any]:
    """
    Helper for compute_dataset() and compute_data_frame().
    """
    if not script:
        raise ValidationError('Python script must not be empty')

    # Include common libraries
    orig_namespace = dict(
        gpd=gpd,
        geopandas=geopandas,
        math=math,
        np=np,
        numpy=numpy,
        pd=pd,
        pandas=pandas,
        sp=sp,
        scipy=scipy,
        xr=xr,
        xarray=xarray,
    )

    if operation_context is not None and 'value_cache' in operation_context:
        orig_namespace.update(operation_context['value_cache'])

    if context_object is not None:
        orig_namespace.update(context_object)

    local_namespace = dict(orig_namespace)

    with monitor.observing("Executing script"):
        try:
            safe_exec(script, local_namespace=local_namespace)
        except BaseException as e:
            raise ValidationError(f'Error in Python script: {e}') from e

    elements = dict()
    for name, element in local_namespace.items():
        if not name.startswith('_'):
            if isinstance(element, element_types):
                if name not in orig_namespace or element is not orig_namespace[
                        name]:
                    elements[name] = element

    return elements
Example #41
0
def _exec_script(script: str,
                 element_types: Tuple[type, ...],
                 operation_context: Mapping[str, Any] = None,
                 context_object: Mapping[str, Any] = None,
                 monitor: Monitor = Monitor.NONE) -> Dict[str, Any]:
    """
    Helper for compute_dataset() and compute_data_frame().
    """
    if not script:
        raise ValidationError('Python script must not be empty')

    # Include common libraries
    orig_namespace = dict(
        gpd=gpd,
        geopandas=geopandas,
        math=math,
        np=np,
        numpy=numpy,
        pd=pd,
        pandas=pandas,
        sp=sp,
        scipy=scipy,
        xr=xr,
        xarray=xarray,
    )

    if operation_context is not None and 'value_cache' in operation_context:
        orig_namespace.update(operation_context['value_cache'])

    if context_object is not None:
        orig_namespace.update(context_object)

    local_namespace = dict(orig_namespace)

    with monitor.observing("Executing script"):
        try:
            safe_exec(script, local_namespace=local_namespace)
        except BaseException as e:
            raise ValidationError(f'Error in Python script: {e}') from e

    elements = dict()
    for name, element in local_namespace.items():
        if not name.startswith('_'):
            if isinstance(element, element_types):
                if name not in orig_namespace or element is not orig_namespace[name]:
                    elements[name] = element

    return elements
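A hedged usage sketch for the two _exec_script variants above: run a tiny script and collect every xr.Dataset it defines. The script text is illustrative only:

new_datasets = _exec_script('result = xr.Dataset({"x": ("t", np.arange(3))})',
                            element_types=(xr.Dataset,),
                            monitor=Monitor.NONE)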
Example #42
0
def _resample_dataset(ds_master: xr.Dataset, ds_replica: xr.Dataset, method_us: int, method_ds: int, monitor: Monitor) -> xr.Dataset:
    """
    Resample replica onto the grid of the master.
    This does spatial resampling of the whole dataset, i.e., of all
    variables in the replica dataset.
    This method works only if both datasets have (time, lat, lon) dimensions.

    Note that dataset attributes are not propagated, because the set of CDM attributes is not yet decided.

    :param ds_master: xr.Dataset whose lat/lon coordinates are used as the resampling grid
    :param ds_replica: xr.Dataset that will be resampled onto the master's grid
    :param method_us: Interpolation method for upsampling, see resampling.py
    :param method_ds: Interpolation method for downsampling, see resampling.py
    :param monitor: a progress monitor.
    :return: xr.Dataset The resampled replica dataset
    """
    # Find lat/lon bounds of the intersection of master and replica grids. The
    # bounds should fall on pixel boundaries for both spatial dimensions for
    # both datasets
    lat_min, lat_max = _find_intersection(ds_master['lat'].values,
                                          ds_replica['lat'].values,
                                          global_bounds=(-90, 90))
    lon_min, lon_max = _find_intersection(ds_master['lon'].values,
                                          ds_replica['lon'].values,
                                          global_bounds=(-180, 180))

    # Subset the replica dataset and the master grid. We are not using the subset
    # operation here, because the subset operation may produce datasets that cross
    # the anti-meridian by design. However, such a disjoint dataset cannot be
    # resampled using our current resampling methods.
    lat_slice = slice(lat_min, lat_max)
    lon_slice = slice(lon_min, lon_max)

    lon = ds_master['lon'].sel(lon=lon_slice)
    lat = ds_master['lat'].sel(lat=lat_slice)
    ds_replica = ds_replica.sel(lon=lon_slice, lat=lat_slice)

    # Don't do anything if datasets already have the same spatial definition
    if _grids_equal(ds_master, ds_replica):
        return ds_replica

    with monitor.starting("coregister dataset", len(ds_replica.data_vars)):
        kwargs = {'lon': lon, 'lat': lat, 'method_us': method_us, 'method_ds': method_ds, 'parent_monitor': monitor}
        retset = ds_replica.apply(_resample_array, keep_attrs=True, **kwargs)

    return adjust_spatial_attrs(retset)
Example #43
0
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform monthly aggregation of a daily dataset according to the given
    method.

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValueError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a daily dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1D':
            raise ValueError(
                'Temporal aggregation operation expects a daily dataset')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    with monitor.observing("resample dataset"):
        retset = ds.resample(freq='MS',
                             dim='time',
                             keep_attrs=True,
                             how=method)

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                    retset[var].attrs['cell_methods'] + \
                    ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
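This variant uses the old xarray resample keywords (dim= and how=), which newer xarray versions no longer accept. A roughly equivalent monthly-mean aggregation can be expressed with the newer resample API; a sketch, not the project's code:

with monitor.observing("resample dataset"):
    retset = ds.resample(time='MS').mean(keep_attrs=True)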
Example #44
0
def _resample_slice(arr_slice: xr.DataArray, w: int, h: int, ds_method: int,
                    us_method: int, parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample a single time slice of a larger xr.DataArray

    :param arr_slice: xr.DataArray single slice
    :param w: The desired new width (amount of longitudes)
    :param h: The desired new height (amount of latitudes)
    :param ds_method: Downsampling method, see resampling.py
    :param us_method: Upsampling method, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: resampled slice
    """
    monitor = parent_monitor.child(1)
    with monitor.observing("resample slice"):
        result = resampling.resample_2d(np.ma.masked_invalid(arr_slice.values),
                                        w, h, ds_method, us_method)
        return xr.DataArray(result)
Example #45
0
    def _build_catalogue(self, monitor: Monitor = Monitor.NONE):

        self._catalogue = {}

        catalogue_metadata = {}

        start_position = 0
        max_records = _CSW_MAX_RESULTS

        matches = -1
        while True:
            # fetch record metadata
            self._catalogue_service.getrecords2(esn='full', outputschema=self._namespaces.get_namespace('gmd'),
                                                startposition=start_position, maxrecords=max_records)
            if matches == -1:
                # set counters, start progress monitor
                matches = self._catalogue_service.results.get('matches')
                if matches == 0:
                    break
                monitor.start(label="Fetching catalogue data... (%d records)" % matches,
                              total_work=ceil(matches / max_records))

            catalogue_metadata.update(self._catalogue_service.records)
            monitor.progress(work=1)

            # bump counters
            start_position += max_records
            if start_position > matches:
                break

        self._catalogue = {
            record.identification.uricode[0]: {
                'abstract': record.identification.abstract,
                'bbox_minx': record.identification.bbox.minx if record.identification.bbox else None,
                'bbox_miny': record.identification.bbox.miny if record.identification.bbox else None,
                'bbox_maxx': record.identification.bbox.maxx if record.identification.bbox else None,
                'bbox_maxy': record.identification.bbox.maxy if record.identification.bbox else None,
                'creation_date':
                    next(iter(e.date for e in record.identification.date if e and e.type == 'creation'), None),
                'publication_date':
                    next(iter(e.date for e in record.identification.date if e and e.type == 'publication'), None),
                'title': record.identification.title,
                'data_sources': record.identification.uricode[1:],
                'licences': record.identification.uselimitation,
                'temporal_coverage_start': record.identification.temporalextent_start,
                'temporal_coverage_end': record.identification.temporalextent_end
            }
            for record in catalogue_metadata.values()
            if record.identification and len(record.identification.uricode) > 0
        }
        monitor.done()
Example #46
0
def _resample_slice(arr_slice: xr.DataArray, w: int, h: int, ds_method: int, us_method: int,
                    parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample a single time slice of a larger xr.DataArray

    :param arr_slice: xr.DataArray single slice
    :param w: The desired new width (amount of longitudes)
    :param h: The desired new height (amount of latitudes)
    :param ds_method: Downsampling method, see resampling.py
    :param us_method: Upsampling method, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: resampled slice
    """
    monitor = parent_monitor.child(1)
    with monitor.observing("resample slice"):
        # In some cases the grouped dimension is not automatically squeezed out
        result = resampling.resample_2d(np.ma.masked_invalid(arr_slice.squeeze().values),
                                        w,
                                        h,
                                        ds_method,
                                        us_method)
        return xr.DataArray(result)
Example #47
0
def plot_hovmoeller(ds: xr.Dataset,
                    var: VarName.TYPE = None,
                    x_axis: DimName.TYPE = None,
                    y_axis: DimName.TYPE = None,
                    method: str = 'mean',
                    contour: bool = True,
                    title: str = None,
                    file: str = None,
                    monitor: Monitor = Monitor.NONE,
                    **kwargs) -> Figure:
    """
    Create a Hovmoeller plot of the given dataset. Dimensions other than
    the ones defined as x and y axis will be aggregated using the given
    method to produce the plot.

    :param ds: Dataset to plot
    :param var: Name of the variable to plot
    :param x_axis: Dimension to show on x axis
    :param y_axis: Dimension to show on y axis
    :param method: Aggregation method
    :param contour: Whether to produce a contour plot
    :param title: Plot title
    :param file: path to a file in which to save the plot
    :param monitor: A progress monitor
    :param kwargs: Keyword arguments to pass to the underlying xarray plotting function
    """
    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)
    var = ds[var_name]

    if not x_axis:
        x_axis = var.dims[0]
    else:
        x_axis = DimName.convert(x_axis)

    if not y_axis:
        try:
            y_axis = var.dims[1]
        except IndexError:
            raise ValidationError('Given dataset variable should have at least two dimensions.')
    else:
        y_axis = DimName.convert(y_axis)

    if x_axis == y_axis:
        raise ValidationError('Dimensions should differ between plot axes.')

    dims = list(var.dims)
    try:
        dims.remove(x_axis)
        dims.remove(y_axis)
    except ValueError:
        raise ValidationError('Given dataset variable: {} does not feature requested dimensions:\
 {}, {}.'.format(var_name, x_axis, y_axis))

    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    with monitor.starting("Plot Hovmoeller", total_work=100):
        monitor.progress(5)
        with monitor.child(90).observing("Aggregate"):
            var = var.reduce(ufuncs[method], dim=dims)
        monitor.progress(5)

    figure = plt.figure()
    ax = figure.add_subplot(111)
    if x_axis == 'time':
        figure.autofmt_xdate()

    if contour:
        var.plot.contourf(ax=ax, x=x_axis, y=y_axis, **kwargs)
    else:
        var.plot.pcolormesh(ax=ax, x=x_axis, y=y_axis, **kwargs)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
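A hypothetical call; the variable name 'sst' is a placeholder for whatever the dataset actually contains:

figure = plot_hovmoeller(ds, var='sst', x_axis='time', y_axis='lat',
                         method='mean', contour=True)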
Example #48
0
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends in 1st of June and each quarter is denoted by its first date
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by the first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight day resolution

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution, overrides output_resolution
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Temporal aggregation operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution of input dataset.'
                              ' Running the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq, keep_attrs=True))
        except AttributeError:
            raise ValidationError(f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(method)

    return adjust_temporal_attrs(retset)
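Hedged usage sketches for the resolutions described in the docstring, assuming ds carries a valid time_coverage_resolution attribute:

monthly = temporal_aggregation(ds, method='mean', output_resolution='month')
seasonal = temporal_aggregation(ds, method='max', output_resolution='season')
eight_day = temporal_aggregation(ds, method='mean', custom_resolution='8D')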
Example #49
0
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset on which to perform outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wild cards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less or equal to this will be removed/masked
    :param threshold_high: Values greater or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Get threshold values; keep them in local names so the
                # original quantile fractions are not overwritten for the
                # next variable in the loop
                with monitor.child(1).observing("quantile low"):
                    thr_low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    thr_high = ret_ds[var_name].quantile(threshold_high)
            else:
                thr_low = threshold_low
                thr_high = threshold_high
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > thr_low) & (arr < thr_high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this data
                # variable
                _mask_outliers(ret_ds, var_name, thr_low, thr_high)
            monitor.progress(1)

    return ret_ds
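A hypothetical call matching the docstring: flag the lowest and highest 5% of values for all variables whose names start with 'temp' (the pattern is a placeholder):

flagged = detect_outliers(ds, var='temp*', threshold_low=0.05,
                          threshold_high=0.95, quantiles=True, mask=True)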
Example #50
0
def anomaly_external(ds: xr.Dataset,
                     file: str,
                     transform: str = None,
                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Calculate anomaly with external reference data, for example, a climatology.
    The given reference dataset is expected to consist of 12 time slices, one
    for each month.

    The returned dataset will contain the variable names found in both - the
    reference and the given dataset. Names found in the given dataset, but not in
    the reference, will be dropped from the resulting dataset. The calculated
    anomaly will be against the corresponding month of the reference data.
    E.g. January against January, etc.

    In case spatial extents differ between the reference and the given dataset,
    the anomaly will be calculated on the intersection.

    :param ds: The dataset to calculate anomalies from
    :param file: Path to reference data file
    :param transform: Apply the given transformation before calculating the anomaly.
                      For supported operations see help on 'ds_arithmetics' operation.
    :param monitor: a progress monitor.
    :return: The anomaly dataset
    """
    # Check if the time coordinate is of dtype datetime
    try:
        if ds.time.dtype != 'datetime64[ns]':
            raise ValidationError('The dataset provided for anomaly calculation'
                                  ' is required to have a time coordinate of'
                                  ' dtype datetime64[ns]. Running the normalize'
                                  ' operation on this dataset might help.')
    except AttributeError:
        raise ValidationError('The dataset provided for anomaly calculation'
                              ' is required to have a time coordinate.')

    try:
        if ds.attrs['time_coverage_resolution'] != 'P1M':
            raise ValidationError('anomaly_external expects a monthly dataset,'
                                  ' got: {} instead.'.format(ds.attrs['time_coverage_resolution']))
    except KeyError:
        try:
            ds = adjust_temporal_attrs(ds)
            if ds.attrs['time_coverage_resolution'] != 'P1M':
                raise ValidationError('anomaly_external expects a monthly dataset,'
                                      ' got: {} instead.'.format(ds.attrs['time_coverage_resolution']))
        except KeyError:
            raise ValidationError('Could not determine temporal resolution'
                                  ' of the given input dataset.')

    clim = xr.open_dataset(file)
    try:
        if len(clim.time) != 12:
            raise ValidationError('The reference dataset is expected to be a '
                                  'monthly climatology. The provided dataset has'
                                  ' a time dimension with length: {}'.format(len(clim.time)))
    except AttributeError:
        raise ValidationError('The reference dataset is required to '
                              'have a time coordinate.')

    ret = ds.copy()
    if transform:
        ret = ds_arithmetics(ds, transform)
    # Group by months, subtract the appropriate slice from the reference
    # Note that this requires that 'time' coordinate labels are of type
    # datetime64[ns]
    total_work = 100
    step = 100 / 12

    with monitor.starting('Anomaly', total_work=total_work):
        monitor.progress(work=0)
        kwargs = {'ref': clim, 'monitor': monitor, 'step': step}
        ret = ret.groupby(ds['time.month']).apply(_group_anomaly,
                                                  **kwargs)

    # Running groupby results in a redundant 'month' variable being added to
    # the dataset
    ret = ret.drop('month')
    ret.attrs = ds.attrs
    # The dataset may be cropped
    return adjust_spatial_attrs(ret)
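The _group_anomaly helper is not shown in these examples. A rough sketch of what it could look like, assuming the reference climatology has its twelve slices ordered January to December; this is an assumption, not the project's code:

def _group_anomaly_sketch(group, ref, monitor, step):
    # Assumed helper: subtract the reference slice of the matching calendar
    # month from every time step in this group, then report progress.
    month = int(group['time.month'].values[0])
    ref_slice = ref.isel(time=month - 1, drop=True)
    monitor.progress(work=step)
    return group - ref_slice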
Example #51
0
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({'zlib': True, 'complevel': compression_level})

        local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path, local_relative_filepath)

            remote_absolute_filepath = os.path.join(self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or time_coverage_start >= time_range[0] and time_coverage_end <= time_range[1]:
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        remote_dataset = None
                        try:
                            remote_dataset = xr.open_dataset(remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop(
                                    [var_name for var_name in remote_dataset.data_vars.keys()
                                     if var_name not in var_names])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = adjust_spatial_attrs_impl(subset_spatial_impl(remote_dataset, region),
                                                                           allow_point=False)

                                if do_update_of_region_meta_info_once:
                                    # subset_spatial_impl
                                    local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                                    local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                                    local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                                    local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys():
                                    remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once and remote_dataset is not None:
                                variables_info = local_ds.meta_info.get('variables', [])
                                local_ds.meta_info['variables'] = [var_info for var_info in variables_info
                                                                   if var_info.get('name')
                                                                   in remote_dataset.variables.keys()
                                                                   and var_info.get('name')
                                                                   not in remote_dataset.dims.keys()]
                                # noinspection PyUnusedLocal
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(os.path.join(local_id, file_name),
                                                 (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath, local_absolute_filepath)
                        local_ds.add_dataset(local_relative_filepath, (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
Example #52
0
def f(monitor: Monitor, x, a=4):
    monitor.start('f', 23)
    return_value = a * x
    monitor.done()
    return return_value
Example #53
0
def data_frame_find_closest(gdf: gpd.GeoDataFrame,
                            location: GeometryLike.TYPE,
                            max_results: int = 1,
                            max_dist: float = 180,
                            dist_col_name: str = 'distance',
                            monitor: Monitor = Monitor.NONE) -> gpd.GeoDataFrame:
    """
    Find the *max_results* records closest to given *location* in the given GeoDataFrame *gdf*.
    Return a new GeoDataFrame containing the closest records.

    If *dist_col_name* is given, store the actual distances in this column.

    Distances are great-circle distances measured in degrees from a representative center of
    the given *location* geometry to the representative centres of each geometry in the *gdf*.

    :param gdf: The GeoDataFrame.
    :param location: A location given as arbitrary geometry.
    :param max_results: Maximum number of results.
    :param max_dist: Ignore records whose distance is greater than this value in degrees.
    :param dist_col_name: Optional name of a new column that will store the actual distances.
    :param monitor: A progress monitor.
    :return: A new GeoDataFrame containing the closest records.
    """
    location = GeometryLike.convert(location)
    location_point = location.representative_point()

    target_crs = dict(init='epsg:4326')
    try:
        source_crs = gdf.crs or target_crs
    except AttributeError:
        source_crs = target_crs
    reprojection_func = _get_reprojection_func(source_crs, target_crs)

    try:
        geometries = gdf.geometry
    except AttributeError as e:
        raise ValidationError('Missing default geometry column in data frame.') from e

    num_rows = len(geometries)
    indexes = list()

    # PERF: Note, this operation may be optimized by computing the great-circle distances using numpy array math!

    total_work = 100
    num_work_rows = 1 + num_rows // total_work
    with monitor.starting('Finding closest records', total_work):
        for i in range(num_rows):
            geometry = geometries.iloc[i]
            if geometry is not None:
                # noinspection PyBroadException
                try:
                    representative_point = geometry.representative_point()
                except BaseException:
                    # For some geometries shapely.representative_point() raises AttributeError or ValueError.
                    # E.g. features that span the poles will raise ValueError.
                    # The quick and dirty solution here is to catch such exceptions and ignore them.
                    representative_point = None
                if representative_point is not None:
                    representative_point = _transform_coordinates(representative_point, reprojection_func)
                    if representative_point is not None:
                        # noinspection PyTypeChecker
                        dist = great_circle_distance(location_point, representative_point)
                        if dist <= max_dist:
                            indexes.append((i, dist))
            if i % num_work_rows == 0:
                monitor.progress(work=1)

    indexes = sorted(indexes, key=lambda item: item[1])
    num_results = min(max_results, len(indexes))
    indexes, distances = zip(*indexes[0:num_results])

    new_gdf = gdf.iloc[list(indexes)]
    if not isinstance(new_gdf, gpd.GeoDataFrame):
        new_gdf = gpd.GeoDataFrame(new_gdf, crs=source_crs)

    if dist_col_name:
        new_gdf[dist_col_name] = np.array(distances)

    return new_gdf
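A hypothetical call, assuming GeometryLike accepts WKT-style text (an assumption here); the point and limits are placeholders:

closest = data_frame_find_closest(gdf, 'POINT (10 53)', max_results=5,
                                  max_dist=10.0, dist_col_name='distance')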
Example #54
0
def _pearsonr(x: xr.DataArray, y: xr.DataArray, monitor: Monitor) -> xr.Dataset:
    """
    Calculate Pearson correlation coefficients and p-values for testing
    non-correlation of lon/lat/time xarray datasets for each lon/lat point.

    Heavily influenced by scipy.stats.pearsonr

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed, and not necessarily zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    :param x: lon/lat/time xr.DataArray
    :param y: xr.DataArray of the same spatiotemporal extents and resolution as x.
    :param monitor: Monitor to use for monitoring the calculation
    :return: A dataset containing the correlation coefficients and p_values on
    the lon/lat grid of x and y.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    with monitor.starting("Calculate Pearson correlation", total_work=6):
        n = len(x['time'])

        xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
        xm.time.values = [i for i in range(0, len(xm.time))]
        ym.time.values = [i for i in range(0, len(ym.time))]
        xm_ym = xm * ym
        r_num = xm_ym.sum(dim='time')
        xm_squared = np.square(xm)
        ym_squared = np.square(ym)
        r_den = np.sqrt(xm_squared.sum(dim='time') * ym_squared.sum(dim='time'))
        r_den = r_den.where(r_den != 0)
        r = r_num / r_den

        # Presumably, if abs(r) > 1, then it is only some small artifact of floating
        # point arithmetic.
        # At this point r should be a lon/lat dataArray, so it should be safe to
        # load it in memory explicitly. This may take time as it will kick-start
        # deferred processing.
        # Comparing with NaN produces warnings that can be safely ignored
        default_warning_settings = np.seterr(invalid='ignore')
        with monitor.child(1).observing("task 1"):
            negative_r = r.values < -1.0
        with monitor.child(1).observing("task 2"):
            r.values[negative_r] = -1.0
        with monitor.child(1).observing("task 3"):
            positive_r = r.values > 1.0
        with monitor.child(1).observing("task 4"):
            r.values[positive_r] = 1.0
        np.seterr(**default_warning_settings)
        r.attrs = {'description': 'Correlation coefficients between'
                   ' {} and {}.'.format(x.name, y.name)}

        df = n - 2
        t_squared = np.square(r) * (df / ((1.0 - r.where(r != 1)) * (1.0 + r.where(r != -1))))

        prob = df / (df + t_squared)
        with monitor.child(1).observing("task 5"):
            prob_values_in = prob.values
        with monitor.child(1).observing("task 6"):
            prob.values = betainc(0.5 * df, 0.5, prob_values_in)
        prob.attrs = {'description': 'Rough indicator of probability of an'
                      ' uncorrelated system producing datasets that have a Pearson'
                      ' correlation at least as extreme as the one computed from'
                      ' these datasets. Not entirely reliable, but reasonable for'
                      ' datasets larger than 500 or so.'}

        retset = xr.Dataset({'corr_coef': r,
                             'p_value': prob})
    return retset
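For a single grid cell, the same statistic can be cross-checked against scipy on two 1-D time series; x1d and y1d below are hypothetical arrays:

from scipy.stats import pearsonr

x1d = np.random.rand(100)
y1d = np.random.rand(100)
r, p = pearsonr(x1d, y1d)  # correlation coefficient and two-sided p-value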
Example #55
0
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return a
    new (Geo)DataFrame with a single row containing all aggregated values. Specify whether the geometries of
    the GeoDataFrame are to be aggregated. All geometries are merged union-like.

    The return data type will always be the same as the input data type.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: returns either DataFrame or GeoDataFrame. Keeps input data type
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable ' + ','.join(diff) + ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not aggregatable!')

    if df_is_geo:
        try:
            df['geometry']
        except KeyError as e:
            raise ValidationError('Variable geometry not in GEO data frame!') from e

    # Aggregate columns
    if vns is None:
        df_buff = df.select_dtypes(include=types_accepted_for_agg).agg(aggregations)
    else:
        df_buff = df[vns].select_dtypes(include=types_accepted_for_agg).agg(aggregations)

    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            val = df_buff[n][a]
            h = n + '_' + a
            res[h] = [val]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            i = 0
            for rec in df.geometry:
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass

                if i % num_work_rows == 0:
                    monitor.progress(work=1)
                i += 1

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg
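A hedged usage sketch; the column names are placeholders for whatever numeric columns the GeoDataFrame actually has:

summary = data_frame_aggregate(gdf, var_names=['population', 'area'],
                               aggregate_geometry=True, monitor=Monitor.NONE)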
Example #56
0
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends in 1st of June and each quarter is denoted by its first date
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by the first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight day resolution

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution, overrides output_resolution
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution of input dataset.'
            ' Running the adjust_temporal_attrs operation beforehand may'
            ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq,
                                                            keep_attrs=True))
        except AttributeError:
            raise ValidationError(
                f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
Example #57
0
def animate_map(ds: xr.Dataset,
                var: VarName.TYPE = None,
                animate_dim: str = 'time',
                interval: int = 200,
                true_range: bool = False,
                indexers: DictLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                projection: str = 'PlateCarree',
                central_lon: float = 0.0,
                title: str = None,
                contour_plot: bool = False,
                cmap_params: DictLike.TYPE = None,
                plot_properties: DictLike.TYPE = None,
                file: str = None,
                monitor: Monitor = Monitor.NONE) -> HTML:
    """
    Create a geographic map animation for the variable given by dataset *ds* and variable name *var*.

    Creates an animation of the given variable from the given dataset on a map with coastal lines.
    In case no variable name is given, the first encountered variable in the
    dataset is animated.
    It is also possible to set extents of the animation. If no extents
    are given, a global animation is created.

    The following file formats for saving the animation are supported: html

    :param ds: the dataset containing the variable to animate
    :param var: the variable's name
    :param animate_dim: Dimension to animate, if none given defaults to time.
    :param interval: Delay between frames in milliseconds. Defaults to 200.
    :param true_range: If True, calculates colormap and colorbar configuration parameters from the
    whole dataset. Can potentially take a lot of time. Defaults to False, in which case the colormap
    is calculated from the first frame.
    :param indexers: Optional indexers into data array of *var*. The *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "layer=4".
    :param region: Region to animate
    :param projection: name of a global projection, see http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html
    :param central_lon: central longitude of the projection in degrees
    :param title: an optional title
    :param contour_plot: If true plot a filled contour plot of data, otherwise plots a pixelated colormesh
    :param cmap_params: optional additional colormap configuration parameters,
           e.g. "vmax=300, cmap='magma'"
           For full reference refer to
           http://xarray.pydata.org/en/stable/generated/xarray.plot.contourf.html
    :param plot_properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5)"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.contourf.html
    :param file: path to a file in which to save the animation
    :param monitor: A progress monitor.
    :return: An animation in HTML format
    """
    if not isinstance(ds, xr.Dataset):
        raise NotImplementedError('Only gridded datasets are currently supported')

    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)

    try:
        var = ds[var_name]
    except KeyError:
        raise ValidationError('Provided variable name "{}" does not exist in the given dataset'.format(var_name))

    indexers = DictLike.convert(indexers) or {}
    properties = DictLike.convert(plot_properties) or {}
    cmap_params = DictLike.convert(cmap_params) or {}

    extents = None
    bounds = handle_plot_polygon(region)
    if bounds:
        lon_min, lat_min, lon_max, lat_max = bounds
        extents = [lon_min, lon_max, lat_min, lat_max]

    if len(ds.lat) < 2 or len(ds.lon) < 2:
        # Matplotlib cannot plot datasets smaller than 2x2 with the
        # contourf and pcolormesh methods
        raise ValidationError('The minimum dataset spatial dimensions to create a map'
                              ' plot are (2,2)')

    # See http://scitools.org.uk/cartopy/docs/v0.15/crs/projections.html#
    if projection == 'PlateCarree':
        proj = ccrs.PlateCarree(central_longitude=central_lon)
    elif projection == 'LambertCylindrical':
        proj = ccrs.LambertCylindrical(central_longitude=central_lon)
    elif projection == 'Mercator':
        proj = ccrs.Mercator(central_longitude=central_lon)
    elif projection == 'Miller':
        proj = ccrs.Miller(central_longitude=central_lon)
    elif projection == 'Mollweide':
        proj = ccrs.Mollweide(central_longitude=central_lon)
    elif projection == 'Orthographic':
        proj = ccrs.Orthographic(central_longitude=central_lon)
    elif projection == 'Robinson':
        proj = ccrs.Robinson(central_longitude=central_lon)
    elif projection == 'Sinusoidal':
        proj = ccrs.Sinusoidal(central_longitude=central_lon)
    elif projection == 'NorthPolarStereo':
        proj = ccrs.NorthPolarStereo(central_longitude=central_lon)
    elif projection == 'SouthPolarStereo':
        proj = ccrs.SouthPolarStereo(central_longitude=central_lon)
    else:
        raise ValidationError('illegal projection: "%s"' % projection)

    figure = plt.figure(figsize=(8, 4))
    ax = plt.axes(projection=proj)
    if extents:
        ax.set_extent(extents, ccrs.PlateCarree())
    else:
        ax.set_global()

    ax.coastlines()

    if not animate_dim:
        animate_dim = 'time'

    indexers[animate_dim] = var[animate_dim][0]

    var_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))

    with monitor.starting("animate", len(var[animate_dim]) + 3):
        if true_range:
            data_min, data_max = _get_min_max(var, monitor=monitor)
        else:
            data_min, data_max = _get_min_max(var_data, monitor=monitor)

        cmap_params = determine_cmap_params(data_min, data_max, **cmap_params)
        plot_kwargs = {**properties, **cmap_params}

        # Plot the first frame to set up the axes and the colorbar properly.
        # The transform keyword gives the coordinate system the data is in, which
        # for a 'normal' lat/lon dataset is PlateCarree.
        if contour_plot:
            var_data.plot.contourf(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                                   add_colorbar=True, **plot_kwargs)
        else:
            var_data.plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                                     add_colorbar=True, **plot_kwargs)
        if title:
            ax.set_title(title)
        figure.tight_layout()
        monitor.progress(1)

        def run(value):
            ax.clear()
            if extents:
                ax.set_extent(extents, ccrs.PlateCarree())
            else:
                ax.set_global()
            ax.coastlines()
            indexers[animate_dim] = value
            var_data = get_var_data(var, indexers, remaining_dims=('lon', 'lat'))
            # Respect the contour_plot choice for every frame, not just the first one.
            if contour_plot:
                var_data.plot.contourf(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                                       add_colorbar=False, **plot_kwargs)
            else:
                var_data.plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(), subplot_kws={'projection': proj},
                                         add_colorbar=False, **plot_kwargs)
            if title:
                ax.set_title(title)
            monitor.progress(1)
            return ax
        anim = animation.FuncAnimation(figure, run, [i for i in var[animate_dim]],
                                       interval=interval, blit=False, repeat=False)
        anim_html = anim.to_jshtml()

        # Prevent the animation from running after it's finished
        del anim

        # Delete the rogue temp-file
        try:
            os.remove('None0000000.png')
        except FileNotFoundError:
            pass

        if file:
            with open(file, 'w') as outfile:
                outfile.write(anim_html)
                monitor.progress(1)

    return HTML(anim_html)
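
For context, a minimal sketch of the FuncAnimation-to-jshtml mechanism the operation above relies on, using plain matplotlib and illustrative random data (no cartopy, colorbar handling or progress monitor).

# Minimal sketch of the FuncAnimation -> to_jshtml pattern (illustrative data,
# no cartopy): plot a first frame, then redraw the axes for every frame.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation

frames = [np.random.rand(10, 10) for _ in range(5)]

figure = plt.figure(figsize=(4, 4))
ax = plt.axes()
ax.pcolormesh(frames[0], vmin=0.0, vmax=1.0)  # first frame sets up the axes


def run(frame):
    # Clear and redraw, as the nested run() above does for each time step.
    ax.clear()
    ax.pcolormesh(frame, vmin=0.0, vmax=1.0)
    return ax


anim = animation.FuncAnimation(figure, run, frames,
                               interval=200, blit=False, repeat=False)
anim_html = anim.to_jshtml()  # embeddable JavaScript/HTML animation

with open('animation.html', 'w') as outfile:  # optional, like the 'file' parameter
    outfile.write(anim_html)
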
Example #58
0
def _lta_general(ds: xr.Dataset, monitor: Monitor):
    """
    Try to carry out a long-term average in the general case, notably
    for seasonal datasets.

    :param ds: Dataset to aggregate
    :param monitor: Progress monitor
    :return: Aggregated dataset
    """
    time_min = pd.Timestamp(ds.time.values[0], tzinfo=timezone.utc)
    time_max = pd.Timestamp(ds.time.values[-1], tzinfo=timezone.utc)
    total_work = 100
    retset = ds

    # The dataset should feature time periods consistent over years
    # and denoted with the same dates each year
    if not _is_seasonal(ds.time):
        raise ValidationError("A long term average dataset can not be created for"
                              " a dataset with inconsistent seasons.")

    # Get a 'representative year': the first year of time stamps, or the
    # second year if it has more time steps (i.e. the first year is incomplete)
    c = 0
    for group in ds.time.groupby('time.year'):
        c = c + 1
        if c == 1:
            rep_year = group[1].time
            continue
        if c == 2 and len(group[1].time) > len(rep_year):
            rep_year = group[1].time
            break

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / len(rep_year.time)
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month', squeeze=False).apply(_groupby_day, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.stack(time=('month', 'day'))

    # Turn month, day coordinates to time
    retset = retset.reset_index('time')
    retset = retset.drop(['month', 'day'])
    retset['time'] = rep_year.time

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (len(rep_year), 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
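
A minimal sketch of the simple, regular-monthly case of the long-term average above, using plain xarray with illustrative names; the seasonal checks, progress monitoring and day-level grouping of _lta_general are deliberately left out.

# Minimal sketch of a monthly long-term average with CF climatology metadata
# (illustrative names; not the general seasonal handling implemented above).
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2000-01-01', periods=36, freq='MS')   # three years, monthly
ds = xr.Dataset({'tas': ('time', np.arange(36.0))}, coords={'time': times})

# Average each calendar month over all years.
lta = ds.groupby('time.month').mean('time')

# Use the first year's time stamps as the 'representative year' coordinate.
rep_year = ds.time.isel(time=slice(0, 12))
lta = lta.rename({'month': 'time'}).assign_coords(time=rep_year.values)

# CF climatology bounds: the full period each climatological value was built from.
time_min, time_max = ds.time.values[0], ds.time.values[-1]
lta['climatology_bounds'] = xr.DataArray(np.tile([time_min, time_max], (12, 1)),
                                         dims=['time', 'nv'])
lta.time.attrs['climatology'] = 'climatology_bounds'
lta.tas.attrs['cell_methods'] = 'time: mean over years'
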