Ejemplo n.º 1
0
    def getrecords(self, monitor: Monitor = Monitor.NONE):
        if not self._catalogue_service:
            self._init_service()

        if not self._catalogue:
            self._build_catalogue(monitor.child(1))

        return self._catalogue
Ejemplo n.º 2
0
Archivo: index.py Proyecto: whigg/cate
def _generic_index_calculation(
        ds: xr.Dataset,
        var: VarName.TYPE,
        region: PolygonLike.TYPE,
        window: int,
        file: str,
        name: str,
        threshold: float = None,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation. Where an index is defined as an anomaly
    against the given reference of a moving average of the given window size of
    the given given region of the given variable of the given dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    :param monitor: a progress monitor.
    :return: A dataset that contains the index timeseries
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values,
                          columns=[name],
                          index=ts.time.values)
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
Ejemplo n.º 3
0
    def _sync_files(self, ftp, ftp_base_dir, expected_remote_files, num_of_expected_remote_files,
                    monitor: Monitor) -> int:
        sync_files_number = 0
        checked_files_number = 0

        files_to_download = OrderedDict()
        file_set_size = 0
        for expected_dir_path, expected_filename_dict in expected_remote_files.items():
            if monitor.is_cancelled():
                raise Cancellation()
            ftp_dir = ftp_base_dir + '/' + expected_dir_path
            try:
                ftp.cwd(ftp_dir)
            except ftplib.Error:
                # Note: If we can't CWD to ftp_dir, this usually means,
                # expected_dir_path may refer to a time range that is not covered remotely.
                monitor.progress(work=1)
                continue

            try:
                remote_dir_content = ftp.mlsd(facts=['type', 'size', 'modify'])
            except ftplib.Error:
                # Note: If we can't MLSD the CWD ftp_dir, we have a problem.
                monitor.progress(work=1)
                continue

            for existing_filename, facts in remote_dir_content:
                if monitor.is_cancelled():
                    raise Cancellation()
                if facts.get('type', None) == 'file' and existing_filename in expected_filename_dict:
                    # update expected_filename_dict with facts of existing_filename
                    expected_filename_dict[existing_filename] = facts
                    file_size = int(facts.get('size', '-1'))
                    if file_size > 0:
                        file_set_size += file_size
                    # TODO (forman, 20160619): put also 'modify' in file_info, to update outdated local files
                    existing_file_info = dict(size=file_size, path=expected_dir_path)
                    files_to_download[existing_filename] = existing_file_info

        last_cwd = None
        if files_to_download:
            dl_stat = _DownloadStatistics(file_set_size)
            for existing_filename, existing_file_info in files_to_download.items():
                checked_files_number += 1
                child_monitor = monitor.child(work=1.)
                if monitor.is_cancelled():
                    raise Cancellation()
                if last_cwd is not existing_file_info['path']:
                    ftp.cwd(ftp_base_dir + '/' + existing_file_info['path'])
                    last_cwd = existing_file_info['path']
                downloader = FtpDownloader(ftp,
                                           existing_filename, existing_file_info, self._file_set_data_store.root_dir,
                                           (checked_files_number, num_of_expected_remote_files), child_monitor,
                                           dl_stat)
                result = downloader.start()
                if DownloadStatus.SUCCESS is result:
                    sync_files_number += 1
        return sync_files_number
Ejemplo n.º 4
0
    def _sync_files(self, ftp, ftp_base_dir, expected_remote_files, num_of_expected_remote_files,
                    monitor: Monitor) -> int:
        sync_files_number = 0
        checked_files_number = 0

        files_to_download = OrderedDict()
        file_set_size = 0
        for expected_dir_path, expected_filename_dict in expected_remote_files.items():
            if monitor.is_cancelled():
                raise Cancellation()
            ftp_dir = ftp_base_dir + '/' + expected_dir_path
            try:
                ftp.cwd(ftp_dir)
            except ftplib.Error:
                # Note: If we can't CWD to ftp_dir, this usually means,
                # expected_dir_path may refer to a time range that is not covered remotely.
                monitor.progress(work=1)
                continue

            try:
                remote_dir_content = ftp.mlsd(facts=['type', 'size', 'modify'])
            except ftplib.Error:
                # Note: If we can't MLSD the CWD ftp_dir, we have a problem.
                monitor.progress(work=1)
                continue

            for existing_filename, facts in remote_dir_content:
                if monitor.is_cancelled():
                    raise Cancellation()
                if facts.get('type', None) == 'file' and existing_filename in expected_filename_dict:
                    # update expected_filename_dict with facts of existing_filename
                    expected_filename_dict[existing_filename] = facts
                    file_size = int(facts.get('size', '-1'))
                    if file_size > 0:
                        file_set_size += file_size
                    # TODO (forman, 20160619): put also 'modify' in file_info, to update outdated local files
                    existing_file_info = dict(size=file_size, path=expected_dir_path)
                    files_to_download[existing_filename] = existing_file_info

        last_cwd = None
        if files_to_download:
            dl_stat = _DownloadStatistics(file_set_size)
            for existing_filename, existing_file_info in files_to_download.items():
                checked_files_number += 1
                child_monitor = monitor.child(work=1.)
                if monitor.is_cancelled():
                    raise Cancellation()
                if last_cwd is not existing_file_info['path']:
                    ftp.cwd(ftp_base_dir + '/' + existing_file_info['path'])
                    last_cwd = existing_file_info['path']
                downloader = FtpDownloader(ftp,
                                           existing_filename, existing_file_info, self._file_set_data_store.root_dir,
                                           (checked_files_number, num_of_expected_remote_files), child_monitor,
                                           dl_stat)
                result = downloader.start()
                if DownloadStatus.SUCCESS is result:
                    sync_files_number += 1
        return sync_files_number
Ejemplo n.º 5
0
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item[:] == 'log':
                    retset = xu.log(retset)
                elif item[:] == 'log10':
                    retset = xu.log10(retset)
                elif item[:] == 'log2':
                    retset = xu.log2(retset)
                elif item[:] == 'log1p':
                    retset = xu.log1p(retset)
                elif item[:] == 'exp':
                    retset = xu.exp(retset)
                else:
                    raise ValueError('Arithmetic operation {} not'
                                     ' implemented.'.format(item[0]))

    return retset
Ejemplo n.º 6
0
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item[:] == 'log':
                    retset = np.log(retset)
                elif item[:] == 'log10':
                    retset = np.log10(retset)
                elif item[:] == 'log2':
                    retset = np.log2(retset)
                elif item[:] == 'log1p':
                    retset = np.log1p(retset)
                elif item[:] == 'exp':
                    retset = np.exp(retset)
                else:
                    raise ValidationError('Arithmetic operation {} not'
                                          ' implemented.'.format(item[0]))

    return retset
Ejemplo n.º 7
0
def _generic_index_calculation(ds: xr.Dataset,
                               var: VarName.TYPE,
                               region: PolygonLike.TYPE,
                               window: int,
                               file: str,
                               name: str,
                               threshold: float = None,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation. Where an index is defined as an anomaly
    against the given reference of a moving average of the given window size of
    the given given region of the given variable of the given dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    :param monitor: a progress monitor.
    :return: A dataset that contains the index timeseries
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values, columns=[name], index=ts.time)
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
Ejemplo n.º 8
0
def _resample_array(array: xr.DataArray, lon: xr.DataArray, lat: xr.DataArray, method_us: int,
                    method_ds: int, parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample the given xr.DataArray to a new grid defined by lat and lon

    :param array: xr.DataArray with lat,lon and time coordinates
    :param lat: 'lat' xr.DataArray attribute for the new grid
    :param lon: 'lon' xr.DataArray attribute for the new grid
    :param method_us: Interpolation method to use for upsampling, see resampling.py
    :param method_ds: Interpolation method to use for downsampling, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: The resampled array
    """
    # Determine width and height of the resampled array
    width = lon.values.size
    height = lat.values.size

    monitor = parent_monitor.child(1)

    kwargs = {'w': width, 'h': height, 'ds_method': method_ds, 'us_method': method_us, 'parent_monitor': monitor}

    groupby_list = list(array.dims)
    for dim in ['lon', 'lat']:
        groupby_list.remove(dim)

    if 0 == len(groupby_list):
        # a 2d dataset, can't do groupby => do a simple slice resample
        with monitor.starting("coregister dataarray", total_work=1):
            temp_array = _resample_slice(array, **kwargs)
            coords = {'lat': lat, 'lon': lon}
            return xr.DataArray(temp_array.values,
                                name=array.name,
                                dims=array.dims,
                                coords=coords,
                                attrs=array.attrs).chunk()

    num_steps = 1
    for dim in groupby_list:
        num_steps = num_steps * len(array[dim])

    with monitor.starting("coregister dataarray", total_work=num_steps):
        temp_array = _nested_groupby_apply(array, groupby_list, _resample_slice, kwargs)
        chunks = {'lat': height, 'lon': width}
        coords = {'lat': lat, 'lon': lon}
        for dim in groupby_list:
            coords[dim] = array[dim]
            # One spatial slice is one dask chunk, e.g. chunking is
            # (1,1,1..1,len(lat),len(lon))
            chunks[dim] = 1
        return xr.DataArray(temp_array.values,
                            name=array.name,
                            dims=array.dims,
                            coords=coords,
                            attrs=array.attrs).chunk(chunks=chunks)
Ejemplo n.º 9
0
def _resample_array(array: xr.DataArray, lon: xr.DataArray, lat: xr.DataArray, method_us: int,
                    method_ds: int, parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample the given xr.DataArray to a new grid defined by lat and lon

    :param array: xr.DataArray with lat,lon and time coordinates
    :param lat: 'lat' xr.DataArray attribute for the new grid
    :param lon: 'lon' xr.DataArray attribute for the new grid
    :param method_us: Interpolation method to use for upsampling, see resampling.py
    :param method_ds: Interpolation method to use for downsampling, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: The resampled array
    """
    # Determine width and height of the resampled array
    width = lon.values.size
    height = lat.values.size

    monitor = parent_monitor.child(1)

    kwargs = {'w': width, 'h': height, 'ds_method': method_ds, 'us_method': method_us, 'parent_monitor': monitor}

    groupby_list = list(array.dims)
    for dim in ['lon', 'lat']:
        groupby_list.remove(dim)

    if 0 == len(groupby_list):
        # a 2d dataset, can't do groupby => do a simple slice resample
        with monitor.starting("coregister dataarray", total_work=1):
            temp_array = _resample_slice(array, **kwargs)
            coords = {'lat': lat, 'lon': lon}
            return xr.DataArray(temp_array.values,
                                name=array.name,
                                dims=array.dims,
                                coords=coords,
                                attrs=array.attrs).chunk()

    num_steps = 1
    for dim in groupby_list:
        num_steps = num_steps * len(array[dim])

    with monitor.starting("coregister dataarray", total_work=num_steps):
        temp_array = _nested_groupby_apply(array, groupby_list, _resample_slice, kwargs)
        chunks = {'lat': height, 'lon': width}
        coords = {'lat': lat, 'lon': lon}
        for dim in groupby_list:
            coords[dim] = array[dim]
            # One spatial slice is one dask chunk, e.g. chunking is
            # (1,1,1..1,len(lat),len(lon))
            chunks[dim] = 1
        return xr.DataArray(temp_array.values,
                            name=array.name,
                            dims=array.dims,
                            coords=coords,
                            attrs=array.attrs).chunk(chunks=chunks)
Ejemplo n.º 10
0
 def update_indices(self,
                    update_file_lists: bool = False,
                    monitor: Monitor = Monitor.NONE):
     with monitor.starting('Updating indices', 100):
         self._init_data_sources()
         monitor.progress(work=10 if update_file_lists else 100)
         if update_file_lists:
             child_monitor = monitor.child(work=90)
             with child_monitor.starting('Updating file lists',
                                         len(self._data_sources)):
                 for data_source in self._data_sources:
                     data_source.update_file_list()
                     child_monitor.progress(work=1)
Ejemplo n.º 11
0
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. If no variables
    have been given explicitly, it can be set that only variables featuring numeric
    values should be reduced.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: reduction method
    :param monitor: A progress monitor
    """
    ufuncs = {
        'min': np.nanmin,
        'max': np.nanmax,
        'mean': np.nanmean,
        'median': np.nanmedian,
        'sum': np.nansum
    }

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [
            value for value in dim if value in retset[var_name].dims
        ]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
Ejemplo n.º 12
0
    def add_local_data_source(self, data_source_id: str, file_path_pattern: str, monitor: Monitor):
        """
        Adds a local data source made up of the specified files.

        :param data_source_id: The identifier of the local data source.
        :param file_path_pattern: The files path containing wildcards.
        :param monitor: a progress monitor.
        :return: JSON-serializable list of 'local' data sources, sorted by name.
        """
        data_store = DATA_STORE_REGISTRY.get_data_store('local')
        if data_store is None:
            raise ValueError('Unknown data store: "%s"' % 'local')
        with monitor.starting('Adding local data source', 100):
            # TODO use monitor, while extracting metadata
            data_store.add_pattern(data_source_id=data_source_id, files=file_path_pattern)
            return self.get_data_sources('local', monitor=monitor.child(100))
Ejemplo n.º 13
0
def _resample_slice(arr_slice: xr.DataArray, w: int, h: int, ds_method: int,
                    us_method: int, parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample a single time slice of a larger xr.DataArray

    :param arr_slice: xr.DataArray single slice
    :param w: The desired new width (amount of longitudes)
    :param h: The desired new height (amount of latitudes)
    :param ds_method: Downsampling method, see resampling.py
    :param us_method: Upsampling method, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: resampled slice
    """
    monitor = parent_monitor.child(1)
    with monitor.observing("resample slice"):
        result = resampling.resample_2d(np.ma.masked_invalid(arr_slice.values),
                                        w, h, ds_method, us_method)
        return xr.DataArray(result)
Ejemplo n.º 14
0
def tseries_mean(ds: xr.Dataset,
                 var: VarNamesLike.TYPE,
                 std_suffix: str = '_std',
                 calculate_std: bool = True,
                 monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Extract spatial mean timeseries of the provided variables, return the
    dataset that in addition to all the information in the given dataset
    contains also timeseries data for the provided variables, following
    naming convention 'var_name1_ts_mean'

    If a data variable with more dimensions than time/lat/lon is provided,
    the data will be reduced by taking the mean of all data values at a single
    time position resulting in one dimensional timeseries data variable.

    :param ds: The dataset from which to perform timeseries extraction.
    :param var: Variables for which to perform timeseries extraction
    :param calculate_std: Whether to calculate std in addition to mean
    :param std_suffix: Std suffix to use for resulting datasets, if std is calculated.
    :param monitor: a progress monitor.
    :return: Dataset with timeseries variables
    """
    if not var:
        var = '*'

    retset = select_var(ds, var)
    names = retset.data_vars.keys()

    with monitor.starting("Calculate mean", total_work=len(names)):
        for name in names:
            dims = list(ds[name].dims)
            dims.remove('time')
            with monitor.child(1).observing("Calculate mean"):
                retset[name] = retset[name].mean(dim=dims, keep_attrs=True)
            retset[name].attrs[
                'Cate_Description'] = 'Mean aggregated over {} at each point in time.'.format(
                    dims)
            std_name = name + std_suffix
            retset[std_name] = ds[name].std(dim=dims)
            retset[std_name].attrs[
                'Cate_Description'] = 'Accompanying std values for variable \'{}\''.format(
                    name)

    return retset
Ejemplo n.º 15
0
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. If no variables
    have been given explicitly, it can be set that only variables featuring numeric
    values should be reduced.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: reduction method
    :param monitor: A progress monitor
    """
    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [value for value in dim if value in retset[var_name].dims]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
Ejemplo n.º 16
0
def _resample_slice(arr_slice: xr.DataArray, w: int, h: int, ds_method: int, us_method: int,
                    parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample a single time slice of a larger xr.DataArray

    :param arr_slice: xr.DataArray single slice
    :param w: The desired new width (amount of longitudes)
    :param h: The desired new height (amount of latitudes)
    :param ds_method: Downsampling method, see resampling.py
    :param us_method: Upsampling method, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: resampled slice
    """
    monitor = parent_monitor.child(1)
    with monitor.observing("resample slice"):
        # In some cases the grouped dimension is not automatically squeezed out
        result = resampling.resample_2d(np.ma.masked_invalid(arr_slice.squeeze().values),
                                        w,
                                        h,
                                        ds_method,
                                        us_method)
        return xr.DataArray(result)
Ejemplo n.º 17
0
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id
        time_range = TimeRangeLike.convert(time_range)
        region = PolygonLike.convert(region)
        var_names = VarNamesLike.convert(var_names)

        time_range, region, var_names = self._apply_make_local_fixes(
            time_range, region, var_names)

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        do_update_of_verified_time_coverage_start_once = True
        verified_time_coverage_start = None
        verified_time_coverage_end = None

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(
                self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(
                    TimeRangeLike.format(time_range))
            raise DataAccessError(msg)
        try:
            if protocol == _ODP_PROTOCOL_OPENDAP:

                do_update_of_variables_meta_info_once = True
                do_update_of_region_meta_info_once = True

                files = self._get_urls_list(selected_file_list, protocol)
                monitor.start('Sync ' + self.id, total_work=len(files))
                for idx, dataset_uri in enumerate(files):
                    child_monitor = monitor.child(work=1)

                    file_name = os.path.basename(dataset_uri)
                    local_filepath = os.path.join(local_path, file_name)

                    time_coverage_start = selected_file_list[idx][1]
                    time_coverage_end = selected_file_list[idx][2]

                    try:
                        child_monitor.start(label=file_name, total_work=1)

                        remote_dataset = xr.open_dataset(dataset_uri)

                        if var_names:
                            remote_dataset = remote_dataset.drop([
                                var_name for var_name in
                                remote_dataset.data_vars.keys()
                                if var_name not in var_names
                            ])

                        if region:
                            remote_dataset = normalize_impl(remote_dataset)
                            remote_dataset = subset_spatial_impl(
                                remote_dataset, region)
                            geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds

                            remote_dataset.attrs[
                                'geospatial_lat_min'] = geo_lat_min
                            remote_dataset.attrs[
                                'geospatial_lat_max'] = geo_lat_max
                            remote_dataset.attrs[
                                'geospatial_lon_min'] = geo_lon_min
                            remote_dataset.attrs[
                                'geospatial_lon_max'] = geo_lon_max
                            if do_update_of_region_meta_info_once:
                                local_ds.meta_info['bbox_maxx'] = geo_lon_max
                                local_ds.meta_info['bbox_minx'] = geo_lon_min
                                local_ds.meta_info['bbox_maxy'] = geo_lat_max
                                local_ds.meta_info['bbox_miny'] = geo_lat_min
                                do_update_of_region_meta_info_once = False

                        if compression_enabled:
                            for sel_var_name in remote_dataset.variables.keys(
                            ):
                                remote_dataset.variables.get(
                                    sel_var_name).encoding.update(
                                        encoding_update)

                        remote_dataset.to_netcdf(local_filepath)

                        child_monitor.progress(work=1,
                                               msg=str(time_coverage_start))
                    finally:
                        if do_update_of_variables_meta_info_once:
                            variables_info = local_ds.meta_info.get(
                                'variables', [])
                            local_ds.meta_info['variables'] = [
                                var_info for var_info in variables_info
                                if var_info.get('name') in remote_dataset.
                                variables.keys() and var_info.get(
                                    'name') not in remote_dataset.dims.keys()
                            ]
                            do_update_of_variables_meta_info_once = False

                        local_ds.add_dataset(
                            os.path.join(local_id, file_name),
                            (time_coverage_start, time_coverage_end))

                        if do_update_of_verified_time_coverage_start_once:
                            verified_time_coverage_start = time_coverage_start
                            do_update_of_verified_time_coverage_start_once = False
                        verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
            else:
                outdated_file_list = []
                for file_rec in selected_file_list:
                    filename, _, _, file_size, url = file_rec
                    dataset_file = os.path.join(local_path, filename)
                    # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                    # ... outdated or incomplete or corrupted.
                    # JSON also includes "checksum" and "checksum_type" fields.
                    if not os.path.isfile(dataset_file) or (
                            file_size
                            and os.path.getsize(dataset_file) != file_size):
                        outdated_file_list.append(file_rec)

                if outdated_file_list:
                    with monitor.starting('Sync ' + self.id,
                                          len(outdated_file_list)):
                        bytes_to_download = sum(
                            [file_rec[3] for file_rec in outdated_file_list])
                        dl_stat = _DownloadStatistics(bytes_to_download)

                        file_number = 1

                        for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                            dataset_file = os.path.join(local_path, filename)
                            sub_monitor = monitor.child(work=1.0)

                            # noinspection PyUnusedLocal
                            def reporthook(block_number, read_size,
                                           total_file_size):
                                dl_stat.handle_chunk(read_size)
                                sub_monitor.progress(work=read_size,
                                                     msg=str(dl_stat))

                            sub_monitor_msg = "file %d of %d" % (
                                file_number, len(outdated_file_list))
                            with sub_monitor.starting(sub_monitor_msg,
                                                      file_size):
                                urllib.request.urlretrieve(
                                    url[protocol],
                                    filename=dataset_file,
                                    reporthook=reporthook)
                            file_number += 1
                            local_ds.add_dataset(
                                os.path.join(local_id, filename),
                                (coverage_from, coverage_to))

                            if do_update_of_verified_time_coverage_start_once:
                                verified_time_coverage_start = coverage_from
                                do_update_of_verified_time_coverage_start_once = False
                            verified_time_coverage_end = coverage_to
        except OSError as e:
            raise DataAccessError(
                "Copying remote data source failed: {}".format(e),
                source=self) from e
        local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(
            verified_time_coverage_start)
        local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(
            verified_time_coverage_end)
        local_ds.save(True)
Ejemplo n.º 18
0
def plot_hovmoeller(ds: xr.Dataset,
                    var: VarName.TYPE = None,
                    x_axis: DimName.TYPE = None,
                    y_axis: DimName.TYPE = None,
                    method: str = 'mean',
                    contour: bool = True,
                    title: str = None,
                    file: str = None,
                    monitor: Monitor = Monitor.NONE,
                    **kwargs) -> Figure:
    """
    Create a Hovmoeller plot of the given dataset. Dimensions other than
    the ones defined as x and y axis will be aggregated using the given
    method to produce the plot.

    :param ds: Dataset to plot
    :param var: Name of the variable to plot
    :param x_axis: Dimension to show on x axis
    :param y_axis: Dimension to show on y axis
    :param method: Aggregation method
    :param contour: Whether to produce a contour plot
    :param title: Plot title
    :param file: path to a file in which to save the plot
    :param monitor: A progress monitor
    :param kwargs: Keyword arguments to pass to underlying xarray plotting fuction
    """
    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)
    var = ds[var_name]

    if not x_axis:
        x_axis = var.dims[0]
    else:
        x_axis = DimName.convert(x_axis)

    if not y_axis:
        try:
            y_axis = var.dims[1]
        except IndexError:
            raise ValidationError(
                'Given dataset variable should have at least two dimensions.')
    else:
        y_axis = DimName.convert(y_axis)

    if x_axis == y_axis:
        raise ValidationError('Dimensions should differ between plot axis.')

    dims = list(var.dims)
    try:
        dims.remove(x_axis)
        dims.remove(y_axis)
    except ValueError:
        raise ValidationError(
            'Given dataset variable: {} does not feature requested dimensions:\
 {}, {}.'.format(var_name, x_axis, y_axis))

    ufuncs = {
        'min': np.nanmin,
        'max': np.nanmax,
        'mean': np.nanmean,
        'median': np.nanmedian,
        'sum': np.nansum
    }

    with monitor.starting("Plot Hovmoeller", total_work=100):
        monitor.progress(5)
        with monitor.child(90).observing("Aggregate"):
            var = var.reduce(ufuncs[method], dim=dims)
        monitor.progress(5)

    figure = plt.figure()
    ax = figure.add_subplot(111)
    if x_axis == 'time':
        figure.autofmt_xdate()

    if contour:
        var.plot.contourf(ax=ax, x=x_axis, y=y_axis, **kwargs)
    else:
        var.plot.pcolormesh(ax=ax, x=x_axis, y=y_axis, **kwargs)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
Ejemplo n.º 19
0
def long_term_average(source: str,
                      year_min: int,
                      year_max: int,
                      file: str,
                      var: VarNamesLike.TYPE = None,
                      save: bool = False,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform the long term monthly average of the given monthly or daily data
    source for the given range of years.

    Depending on the given year range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    :param source: The data source from which to extract the monthly average
    :param year_min: The earliest year of the desired time range
    :param year_max: The most recent year of the desired time range
    :param file: filepath where to save the long term average dataset
    :param var: If given, only these variable names will be preserved in the
    output.
    :param save: If True, saves the data downloaded during this operation. This
    can potentially be a very large amount of data.
    :param monitor: A progress monitor to use
    :return: The Long Term Average dataset.
    """
    var = VarNamesLike.convert(var)

    n_years = year_max - year_min + 1
    res = 0
    total_work = 100

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query\
                         term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query\
                         term {}".format(data_sources, source))

    data_source = data_sources[0]
    source_info = data_source.cache_info

    # Check if we have a monthly data source
    fq = data_source.meta_info['time_frequency']
    if fq != 'mon':
        raise ValueError("Only monthly datasets are supported for time being.")

    with monitor.starting('LTA', total_work=total_work):
        # Set up the monitor
        monitor.progress(work=0)
        step = total_work * 0.9 / n_years

        # Process the data source year by year
        year = year_min
        while year != year_max + 1:

            tmin = "{}-01-01".format(year)
            tmax = "{}-12-31".format(year)

            # Determine if the data for the given year are already downloaded
            # If at least one file of the given time range is present, we
            # don't delete the data for this year, we do the syncing anyway.
            was_already_downloaded = False
            dt_range = to_datetime_range(tmin, tmax)
            for date in source_info:
                if dt_range[0] <= date <= dt_range[1]:
                    was_already_downloaded = True
                    # One is enough
                    break

            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            try:
                if res == 0:
                    res = ds / n_years
                else:
                    # Xarray doesn't do automatic alignment for in place
                    # operations, hence we have to do it manually
                    res = res + ds.reindex_like(res) / n_years
            except TypeError:
                raise TypeError('One or more data arrays feature a dtype that\
                                can not be divided. Consider using the var\
                                parameter to filter the dataset.')

            ds.close()
            # delete data for the current year, if it should be deleted and it
            # was not already downloaded.
            if (not save) and (not was_already_downloaded):
                data_source.delete_local(dt_range)

            monitor.progress(work=step * 0.1)

            year = year + 1

        monitor.progress(msg='Saving the LTA dataset')
        save_dataset(res, file)
        monitor.progress(total_work * 0.1)

    return res
Ejemplo n.º 20
0
def temporal_agg(source: str,
                 start_date: str = None,
                 end_date: str = None,
                 var: VarNamesLike.TYPE = None,
                 level: str = 'mon',
                 method: str = 'mean',
                 save_data: bool = False,
                 monitor: Monitor = Monitor.NONE) -> (xr.Dataset, str):
    """
    Perform temporal aggregation of the given data source to the given level
    using the given method for the given time range. Only full time periods
    of the given time range will be aggregated.

    Depending on the given time range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    The aggregation result is saved into the local data store for later reuse.

    :param source: Data source to aggregate
    :param start_date: Start date of aggregation. If not given, data source
    start date is used instead
    :param end_date: End date of aggregation. If not given, data source end
    date is used instead
    :param var: If given, only these dataset variables will be preserved in the
    result
    :param level: Aggregation level
    :param method: Aggregation method
    :param save_data: Whether to save data downloaded during this operation.
    This can potentially be a lot of data.
    :param monitor: A progress monitor to use
    :return: The local data source identifier for the aggregated data
    """
    # Raise not implemented, while not finished
    raise ValueError("Operation is not implemented.")

    var = VarNamesLike.convert(var)

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query "
                         "term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query "
                         "term {}".format(data_sources, source))

    data_source = data_sources[0]
    source_info = data_source.cache_info

    # We have to do this to have temporal coverage info in meta_info
    data_source._init_file_list()

    # Check if the data source temporal resolution is known
    known_res = ('day', '8-days', 'mon', 'yr')

    fq = data_source.meta_info['time_frequency']
    if (not fq) or (fq not in known_res):
        raise ValueError("The given data source features unknown time "
                         "resolution: {}".format(fq))

    # Check if the operation supports the desired aggregation step
    valid_steps = list()
    valid_steps.append(('day', 'mon'))
    if (fq, level) not in valid_steps:
        raise ValueError("Currently the operation does not support aggregation"
                         " from {} to {}".format(fq, level))

    # Determine start and end dates
    if not start_date:
        start_date = data_source.meta_info['temporal_coverage_start']
    start_date = to_datetime(start_date)
    # If start_date is not start of the month, move it to the 1st of next
    # month
    if start_date.day != 1:
        try:
            start_date = datetime(start_date.year, start_date.month + 1, 1)
        except ValueError:
            # We have tried to set the month to 13
            start_date = datetime(start_date.year + 1, 1, 1)

    if not end_date:
        end_date = data_source.meta_info['temporal_coverage_end']
    end_date = to_datetime(end_date)
    # If end date is not end of the month, move it to the last day of the
    # previous month
    if not _is_end_of_month(end_date):
        try:
            end_date = datetime(end_date.year, end_date.month - 1, 27)
        except ValueError:
            # We have tried to set the month to 0
            end_date = datetime(end_date.year - 1, 12, 31)

    end_date = _end_of_month(end_date.year, end_date.month)

    # Determine the count of processing periods
    n_periods = (end_date.year - start_date.year + 1) * 12\
        + end_date.month - start_date.month - 11
    # 2000-4-1, 2000-6-30 -> 12 + 2 -11 = 3

    if n_periods < 1:
        raise ValueError("The given time range does not contain any full "
                         "calendar months to do aggregation with.")

    # Set up the monitor
    total_work = 100
    with monitor.starting('Aggregate', total_work=total_work):
        monitor.progress(work=0)
        step = total_work * 0.9 / n_periods

        # Process the data source period by period
        tmin = start_date
        while tmin < end_date:
            tmax = _end_of_month(tmin.year, tmin.month)

            # Determine if the data for the given period are already downloaded
            # If at least one file of the given time range is present, we
            # don't delete the data for this period, we do the syncing anyway
            was_already_downloaded = False
            dt_range = to_datetime_range(tmin, tmax)
            for date in source_info:
                if dt_range[0] <= date <= dt_range[1]:
                    was_already_downloaded = True
                    # One is enough
                    break

            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            # Do the aggregation

            # Save the dataset for this period into local data store

            # Close and delete the files if needed
            ds.close()
            # delete data for the current period,if it should be deleted and it
            # was not already downloaded.
            if (not save_data) and (not was_already_downloaded):
                data_source.delete_local(dt_range)

            monitor.progress(work=step * 0.1)

            # tmin for next iteration
            try:
                tmin = datetime(tmin.year, tmin.month + 1, 1)
            except ValueError:
                # Couldn't add a month -> end of year
                tmin = datetime(tmin.year + 1, 1, 1)
            pass

    monitor.progress(work=step * 0.1)

    # Return the local data source id
    return None
Ejemplo n.º 21
0
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path,
                                                   local_relative_filepath)

            remote_absolute_filepath = os.path.join(
                self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or time_coverage_start >= time_range[
                        0] and time_coverage_end <= time_range[1]:
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        try:
                            remote_dataset = xr.open_dataset(
                                remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop([
                                    var_name for var_name in
                                    remote_dataset.data_vars.keys()
                                    if var_name not in var_names
                                ])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = subset_spatial_impl(
                                    remote_dataset, region)
                                geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds

                                remote_dataset.attrs[
                                    'geospatial_lat_min'] = geo_lat_min
                                remote_dataset.attrs[
                                    'geospatial_lat_max'] = geo_lat_max
                                remote_dataset.attrs[
                                    'geospatial_lon_min'] = geo_lon_min
                                remote_dataset.attrs[
                                    'geospatial_lon_max'] = geo_lon_max
                                if do_update_of_region_meta_info_once:
                                    local_ds.meta_info[
                                        'bbox_maxx'] = geo_lon_max
                                    local_ds.meta_info[
                                        'bbox_minx'] = geo_lon_min
                                    local_ds.meta_info[
                                        'bbox_maxy'] = geo_lat_max
                                    local_ds.meta_info[
                                        'bbox_miny'] = geo_lat_min
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys(
                                ):
                                    remote_dataset.variables.get(
                                        sel_var_name).encoding.update(
                                            encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(
                                work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once:
                                variables_info = local_ds.meta_info.get(
                                    'variables', [])
                                local_ds.meta_info['variables'] = [
                                    var_info for var_info in variables_info
                                    if var_info.get('name') in remote_dataset.
                                    variables.keys() and var_info.get('name')
                                    not in remote_dataset.dims.keys()
                                ]
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(
                                os.path.join(local_id, file_name),
                                (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath,
                                    local_absolute_filepath)
                        local_ds.add_dataset(
                            local_relative_filepath,
                            (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
Ejemplo n.º 22
0
def _pearsonr(x: xr.DataArray, y: xr.DataArray, monitor: Monitor) -> xr.Dataset:
    """
    Calculate Pearson correlation coefficients and p-values for testing
    non-correlation of lon/lat/time xarray datasets for each lon/lat point.

    Heavily influenced by scipy.stats.pearsonr

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed, and not necessarily zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    :param x: lon/lat/time xr.DataArray
    :param y: xr.DataArray of the same spatiotemporal extents and resolution as x.
    :param monitor: Monitor to use for monitoring the calculation
    :return: A dataset containing the correlation coefficients and p_values on
    the lon/lat grid of x and y.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    with monitor.starting("Calculate Pearson correlation", total_work=6):
        n = len(x['time'])

        xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
        xm.time.values = [i for i in range(0, len(xm.time))]
        ym.time.values = [i for i in range(0, len(ym.time))]
        xm_ym = xm * ym
        r_num = xm_ym.sum(dim='time')
        xm_squared = np.square(xm)
        ym_squared = np.square(ym)
        r_den = np.sqrt(xm_squared.sum(dim='time') * ym_squared.sum(dim='time'))
        r_den = r_den.where(r_den != 0)
        r = r_num / r_den

        # Presumably, if abs(r) > 1, then it is only some small artifact of floating
        # point arithmetic.
        # At this point r should be a lon/lat dataArray, so it should be safe to
        # load it in memory explicitly. This may take time as it will kick-start
        # deferred processing.
        # Comparing with NaN produces warnings that can be safely ignored
        default_warning_settings = np.seterr(invalid='ignore')
        with monitor.child(1).observing("task 1"):
            negativ_r = r.values < -1.0
        with monitor.child(1).observing("task 2"):
            r.values[negativ_r] = -1.0
        with monitor.child(1).observing("task 3"):
            positiv_r = r.values > 1.0
        with monitor.child(1).observing("task 4"):
            r.values[positiv_r] = 1.0
        np.seterr(**default_warning_settings)
        r.attrs = {'description': 'Correlation coefficients between'
                   ' {} and {}.'.format(x.name, y.name)}

        df = n - 2
        t_squared = np.square(r) * (df / ((1.0 - r.where(r != 1)) * (1.0 + r.where(r != -1))))

        prob = df / (df + t_squared)
        with monitor.child(1).observing("task 5"):
            prob_values_in = prob.values
        with monitor.child(1).observing("task 6"):
            prob.values = betainc(0.5 * df, 0.5, prob_values_in)
        prob.attrs = {'description': 'Rough indicator of probability of an'
                      ' uncorrelated system producing datasets that have a Pearson'
                      ' correlation at least as extreme as the one computed from'
                      ' these datsets. Not entirely reliable, but reasonable for'
                      ' datasets larger than 500 or so.'}

        retset = xr.Dataset({'corr_coef': r,
                             'p_value': prob})
    return retset
Ejemplo n.º 23
0
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        # local_name = local_ds.name
        local_id = local_ds.name

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)

        if protocol == _ODP_PROTOCOL_OPENDAP:

            files = self._get_urls_list(selected_file_list, protocol)
            monitor.start('Sync ' + self.name, total_work=len(files))
            for idx, dataset_uri in enumerate(files):
                child_monitor = monitor.child(work=1)

                file_name = os.path.basename(dataset_uri)
                local_filepath = os.path.join(local_path, file_name)

                time_coverage_start = selected_file_list[idx][1]
                time_coverage_end = selected_file_list[idx][2]

                remote_netcdf = None
                local_netcdf = None
                try:
                    remote_netcdf = NetCDF4DataStore(dataset_uri)

                    local_netcdf = NetCDF4DataStore(local_filepath,
                                                    mode='w',
                                                    persist=True)
                    local_netcdf.set_attributes(remote_netcdf.get_attrs())

                    remote_dataset = xr.Dataset.load_store(remote_netcdf)

                    process_region = False
                    if region:
                        geo_lat_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_min')
                        geo_lat_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_max')
                        geo_lon_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_min')
                        geo_lon_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_max')

                        geo_lat_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_resolution')
                        geo_lon_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_resolution')
                        if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                                or isnan(geo_lon_min) or isnan(geo_lon_max)
                                or isnan(geo_lat_res) or isnan(geo_lon_res)):
                            process_region = True

                            [lat_min, lon_min, lat_max,
                             lon_max] = region.bounds

                            lat_min = floor(
                                (lat_min - geo_lat_min) / geo_lat_res)
                            lat_max = ceil(
                                (lat_max - geo_lat_min) / geo_lat_res)
                            lon_min = floor(
                                (lon_min - geo_lon_min) / geo_lon_res)
                            lon_max = ceil(
                                (lon_max - geo_lon_min) / geo_lon_res)

                            # TODO (kbernat): check why dataset.sel fails!
                            remote_dataset = remote_dataset.isel(
                                drop=False,
                                lat=slice(lat_min, lat_max),
                                lon=slice(lon_min, lon_max))

                            geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                            geo_lat_min += lat_min * geo_lat_res
                            geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                            geo_lon_min += lon_min * geo_lon_res

                    if not var_names:
                        var_names = [
                            var_name
                            for var_name in remote_netcdf.variables.keys()
                        ]
                    var_names.extend([
                        coord_name
                        for coord_name in remote_dataset.coords.keys()
                        if coord_name not in var_names
                    ])
                    child_monitor.start(label=file_name,
                                        total_work=len(var_names))
                    for sel_var_name in var_names:
                        var_dataset = remote_dataset.drop([
                            var_name
                            for var_name in remote_dataset.variables.keys()
                            if var_name != sel_var_name
                        ])
                        if compression_enabled:
                            var_dataset.variables.get(
                                sel_var_name).encoding.update(encoding_update)
                        local_netcdf.store_dataset(var_dataset)
                        child_monitor.progress(work=1, msg=sel_var_name)
                    if process_region:
                        local_netcdf.set_attribute('geospatial_lat_min',
                                                   geo_lat_min)
                        local_netcdf.set_attribute('geospatial_lat_max',
                                                   geo_lat_max)
                        local_netcdf.set_attribute('geospatial_lon_min',
                                                   geo_lon_min)
                        local_netcdf.set_attribute('geospatial_lon_max',
                                                   geo_lon_max)

                finally:
                    if remote_netcdf:
                        remote_netcdf.close()
                    if local_netcdf:
                        local_netcdf.close()
                        local_ds.add_dataset(
                            os.path.join(local_id, file_name),
                            (time_coverage_start, time_coverage_end))

                child_monitor.done()
        else:
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) or (
                        file_size
                        and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)

            if outdated_file_list:
                with monitor.starting('Sync ' + self.name,
                                      len(outdated_file_list)):
                    bytes_to_download = sum(
                        [file_rec[3] for file_rec in outdated_file_list])
                    dl_stat = _DownloadStatistics(bytes_to_download)

                    file_number = 1

                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        if monitor.is_cancelled():
                            raise InterruptedError
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size,
                                       total_file_size):
                            dl_stat.handle_chunk(read_size)
                            if monitor.is_cancelled():
                                raise InterruptedError
                            sub_monitor.progress(work=read_size,
                                                 msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (
                            file_number, len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg, file_size):
                            urllib.request.urlretrieve(url[protocol],
                                                       filename=dataset_file,
                                                       reporthook=reporthook)
                        file_number += 1
                        local_ds.add_dataset(os.path.join(local_id, filename),
                                             (coverage_from, coverage_to))
        local_ds.save()
        monitor.done()
Ejemplo n.º 24
0
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        # local_name = local_ds.name
        local_id = local_ds.name

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.name, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path,
                                                   local_relative_filepath)

            remote_absolute_filepath = os.path.join(
                self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                remote_netcdf = None
                local_netcdf = None
                if not time_range or time_coverage_start >= time_range[
                        0] and time_coverage_end <= time_range[1]:
                    if region or var_names:
                        try:
                            remote_netcdf = NetCDF4DataStore(
                                remote_absolute_filepath)

                            local_netcdf = NetCDF4DataStore(
                                local_absolute_filepath,
                                mode='w',
                                persist=True)
                            local_netcdf.set_attributes(
                                remote_netcdf.get_attrs())

                            remote_dataset = xr.Dataset.load_store(
                                remote_netcdf)

                            process_region = False
                            if region:
                                geo_lat_min = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lat_min')
                                geo_lat_max = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lat_max')
                                geo_lon_min = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lon_min')
                                geo_lon_max = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lon_max')

                                geo_lat_res = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs,
                                    'geospatial_lon_resolution')
                                geo_lon_res = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs,
                                    'geospatial_lat_resolution')
                                if not (isnan(geo_lat_min)
                                        or isnan(geo_lat_max)
                                        or isnan(geo_lon_min)
                                        or isnan(geo_lon_max)
                                        or isnan(geo_lat_res)
                                        or isnan(geo_lon_res)):
                                    process_region = True

                                    [lat_min, lon_min, lat_max,
                                     lon_max] = region.bounds

                                    lat_min = floor(
                                        (lat_min - geo_lat_min) / geo_lat_res)
                                    lat_max = ceil(
                                        (lat_max - geo_lat_min) / geo_lat_res)
                                    lon_min = floor(
                                        (lon_min - geo_lon_min) / geo_lon_res)
                                    lon_max = ceil(
                                        (lon_max - geo_lon_min) / geo_lon_res)

                                    # TODO (kbernat): check why dataset.sel fails!
                                    remote_dataset = remote_dataset.isel(
                                        drop=False,
                                        lat=slice(lat_min, lat_max),
                                        lon=slice(lon_min, lon_max))

                                    geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                                    geo_lat_min += lat_min * geo_lat_res
                                    geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                                    geo_lon_min += lon_min * geo_lon_res

                            if not var_names:
                                var_names = [
                                    var_name for var_name in
                                    remote_netcdf.variables.keys()
                                ]
                            var_names.extend([
                                coord_name
                                for coord_name in remote_dataset.coords.keys()
                                if coord_name not in var_names
                            ])
                            child_monitor.start(label=file_name,
                                                total_work=len(var_names))
                            for sel_var_name in var_names:
                                var_dataset = remote_dataset.drop([
                                    var_name for var_name in
                                    remote_dataset.variables.keys()
                                    if var_name != sel_var_name
                                ])
                                if compression_enabled:
                                    var_dataset.variables.get(
                                        sel_var_name).encoding.update(
                                            encoding_update)
                                local_netcdf.store_dataset(var_dataset)
                                child_monitor.progress(work=1,
                                                       msg=sel_var_name)
                            if process_region:
                                local_netcdf.set_attribute(
                                    'geospatial_lat_min', geo_lat_min)
                                local_netcdf.set_attribute(
                                    'geospatial_lat_max', geo_lat_max)
                                local_netcdf.set_attribute(
                                    'geospatial_lon_min', geo_lon_min)
                                local_netcdf.set_attribute(
                                    'geospatial_lon_max', geo_lon_max)
                        finally:
                            if remote_netcdf:
                                remote_netcdf.close()
                            if local_netcdf:
                                local_netcdf.close()
                                local_ds.add_dataset(
                                    local_relative_filepath,
                                    (time_coverage_start, time_coverage_end))
                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath,
                                    local_absolute_filepath)
                        local_ds.add_dataset(
                            local_relative_filepath,
                            (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
Ejemplo n.º 25
0
def _pearsonr(x: xr.DataArray, y: xr.DataArray,
              monitor: Monitor) -> xr.Dataset:
    """
    Calculate Pearson correlation coefficients and p-values for testing
    non-correlation of lon/lat/time xarray datasets for each lon/lat point.

    Heavily influenced by scipy.stats.pearsonr

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed, and not necessarily zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    :param x: lon/lat/time xr.DataArray
    :param y: xr.DataArray of the same spatiotemporal extents and resolution as x.
    :param monitor: Monitor to use for monitoring the calculation
    :return: A dataset containing the correlation coefficients and p_values on
    the lon/lat grid of x and y.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    with monitor.starting("Calculate Pearson correlation", total_work=6):
        n = len(x['time'])

        xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
        xm['time'] = [i for i in range(0, len(xm.time))]
        ym['time'] = [i for i in range(0, len(ym.time))]
        xm_ym = xm * ym
        r_num = xm_ym.sum(dim='time')
        xm_squared = np.square(xm)
        ym_squared = np.square(ym)
        r_den = np.sqrt(
            xm_squared.sum(dim='time') * ym_squared.sum(dim='time'))
        r_den = r_den.where(r_den != 0)
        r = r_num / r_den

        # Presumably, if abs(r) > 1, then it is only some small artifact of floating
        # point arithmetic.
        # At this point r should be a lon/lat dataArray, so it should be safe to
        # load it in memory explicitly. This may take time as it will kick-start
        # deferred processing.
        # Comparing with NaN produces warnings that can be safely ignored
        default_warning_settings = np.seterr(invalid='ignore')
        with monitor.child(1).observing("task 1"):
            negativ_r = r.values < -1.0
        with monitor.child(1).observing("task 2"):
            r.values[negativ_r] = -1.0
        with monitor.child(1).observing("task 3"):
            positiv_r = r.values > 1.0
        with monitor.child(1).observing("task 4"):
            r.values[positiv_r] = 1.0
        np.seterr(**default_warning_settings)
        r.attrs = {
            'description':
            'Correlation coefficients between'
            ' {} and {}.'.format(x.name, y.name)
        }

        df = n - 2
        t_squared = np.square(r) * (df / ((1.0 - r.where(r != 1)) *
                                          (1.0 + r.where(r != -1))))

        prob = df / (df + t_squared)
        with monitor.child(1).observing("task 5"):
            prob_values_in = prob.values
        with monitor.child(1).observing("task 6"):
            prob.values = betainc(0.5 * df, 0.5, prob_values_in)
        prob.attrs = {
            'description':
            'Rough indicator of probability of an'
            ' uncorrelated system producing datasets that have a Pearson'
            ' correlation at least as extreme as the one computed from'
            ' these datsets. Not entirely reliable, but reasonable for'
            ' datasets larger than 500 or so.'
        }

        retset = xr.Dataset({'corr_coef': r, 'p_value': prob})
    return retset
Ejemplo n.º 26
0
def plot_hovmoeller(ds: xr.Dataset,
                    var: VarName.TYPE = None,
                    x_axis: DimName.TYPE = None,
                    y_axis: DimName.TYPE = None,
                    method: str = 'mean',
                    contour: bool = True,
                    title: str = None,
                    file: str = None,
                    monitor: Monitor = Monitor.NONE,
                    **kwargs) -> Figure:
    """
    Create a Hovmoeller plot of the given dataset. Dimensions other than
    the ones defined as x and y axis will be aggregated using the given
    method to produce the plot.

    :param ds: Dataset to plot
    :param var: Name of the variable to plot
    :param x_axis: Dimension to show on x axis
    :param y_axis: Dimension to show on y axis
    :param method: Aggregation method
    :param contour: Whether to produce a contour plot
    :param title: Plot title
    :param file: path to a file in which to save the plot
    :param monitor: A progress monitor
    :param kwargs: Keyword arguments to pass to underlying xarray plotting fuction
    """
    var_name = None
    if not var:
        for key in ds.data_vars.keys():
            var_name = key
            break
    else:
        var_name = VarName.convert(var)
    var = ds[var_name]

    if not x_axis:
        x_axis = var.dims[0]
    else:
        x_axis = DimName.convert(x_axis)

    if not y_axis:
        try:
            y_axis = var.dims[1]
        except IndexError:
            raise ValidationError('Given dataset variable should have at least two dimensions.')
    else:
        y_axis = DimName.convert(y_axis)

    if x_axis == y_axis:
        raise ValidationError('Dimensions should differ between plot axis.')

    dims = list(var.dims)
    try:
        dims.remove(x_axis)
        dims.remove(y_axis)
    except ValueError:
        raise ValidationError('Given dataset variable: {} does not feature requested dimensions:\
 {}, {}.'.format(var_name, x_axis, y_axis))

    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    with monitor.starting("Plot Hovmoeller", total_work=100):
        monitor.progress(5)
        with monitor.child(90).observing("Aggregate"):
            var = var.reduce(ufuncs[method], dim=dims)
        monitor.progress(5)

    figure = plt.figure()
    ax = figure.add_subplot(111)
    if x_axis == 'time':
        figure.autofmt_xdate()

    if contour:
        var.plot.contourf(ax=ax, x=x_axis, y=y_axis, **kwargs)
    else:
        var.plot.pcolormesh(ax=ax, x=x_axis, y=y_axis, **kwargs)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
Ejemplo n.º 27
0
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset to which to do outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wild cards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less or equal to this will be removed/masked
    :param threshold_high: Values greater or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Get threshold values
                with monitor.child(1).observing("quantile low"):
                    threshold_low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    threshold_high = ret_ds[var_name].quantile(threshold_high)
            else:
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > threshold_low)
                                             & (arr < threshold_high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this data
                # variable
                _mask_outliers(ret_ds, var_name, threshold_low, threshold_high)
            monitor.progress(1)

    return ret_ds
Ejemplo n.º 28
0
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset to which to do outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wild cards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less or equal to this will be removed/masked
    :param threshold_high: Values greater or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Get threshold values
                with monitor.child(1).observing("quantile low"):
                    threshold_low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    threshold_high = ret_ds[var_name].quantile(threshold_high)
            else:
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > threshold_low) & (arr < threshold_high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this data
                # variable
                _mask_outliers(ret_ds, var_name, threshold_low, threshold_high)
            monitor.progress(1)

    return ret_ds
Ejemplo n.º 29
0
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({'zlib': True, 'complevel': compression_level})

        local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path, local_relative_filepath)

            remote_absolute_filepath = os.path.join(self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or time_coverage_start >= time_range[0] and time_coverage_end <= time_range[1]:
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        remote_dataset = None
                        try:
                            remote_dataset = xr.open_dataset(remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop(
                                    [var_name for var_name in remote_dataset.data_vars.keys()
                                     if var_name not in var_names])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = adjust_spatial_attrs_impl(subset_spatial_impl(remote_dataset, region),
                                                                           allow_point=False)

                                if do_update_of_region_meta_info_once:
                                    # subset_spatial_impl
                                    local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                                    local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                                    local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                                    local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys():
                                    remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once and remote_dataset is not None:
                                variables_info = local_ds.meta_info.get('variables', [])
                                local_ds.meta_info['variables'] = [var_info for var_info in variables_info
                                                                   if var_info.get('name')
                                                                   in remote_dataset.variables.keys()
                                                                   and var_info.get('name')
                                                                   not in remote_dataset.dims.keys()]
                                # noinspection PyUnusedLocal
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(os.path.join(local_id, file_name),
                                                 (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath, local_absolute_filepath)
                        local_ds.add_dataset(local_relative_filepath, (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id