Example #1
    def sync(self,
             time_range: Tuple[datetime, datetime] = None,
             protocol: str = None,
             monitor: Monitor = Monitor.NONE) -> Tuple[int, int]:
        # TODO (kbernat, 20161221): remove remote_url validation, there is no public interface to modify it
        assert self._file_set_data_store.remote_url
        url = urllib.parse.urlparse(self._file_set_data_store.remote_url)
        if url.scheme != 'ftp':
            raise ValueError("invalid remote URL: cannot deal with scheme %s" %
                             repr(url.scheme))
        ftp_host_name = url.hostname
        ftp_base_dir = url.path

        expected_remote_files = self._get_expected_remote_files(time_range)
        if len(expected_remote_files) == 0:
            return 0, 0

        num_of_synchronised_files = 0
        num_of_expected_remote_files = len(
            list(chain.from_iterable(expected_remote_files.values())))
        with monitor.starting('Sync %s' % self._name,
                              num_of_expected_remote_files):
            try:
                with ftplib.FTP(ftp_host_name) as ftp:
                    ftp.login()
                    num_of_synchronised_files = self._sync_files(
                        ftp, ftp_base_dir, expected_remote_files,
                        num_of_expected_remote_files, monitor)
            except ftplib.Error as ftp_err:
                if not monitor.is_cancelled():
                    print('FTP error: %s' % ftp_err)

        return num_of_synchronised_files, num_of_expected_remote_files
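
A minimal stand-alone sketch of the expected-file bookkeeping used above; the directory paths and file names below are hypothetical placeholders:

# Hedged sketch of the expected-file counting in sync(); the contents of
# expected_remote_files are made-up placeholders.
from itertools import chain

expected_remote_files = {
    '2007/01': {'file_a.nc': None, 'file_b.nc': None},
    '2007/02': {'file_c.nc': None},
}
num_of_expected_remote_files = len(
    list(chain.from_iterable(expected_remote_files.values())))
print(num_of_expected_remote_files)  # 3
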
Example #2
    def add_local_datasource(self, data_source_name: str, filePathPattern: str,
                             monitor: Monitor):
        """
        Adds a local data source made up of the specified files.

        :param data_source_name: The name of the local data source.
        :param filePathPattern: The file path pattern, which may contain wildcards.
        :param monitor: a progress monitor.
        :return: JSON-serializable list of 'local' data sources, sorted by name.
        """
        data_store = DATA_STORE_REGISTRY.get_data_store('local')
        if data_store is None:
            raise ValueError('Unknown data store: "%s"' % 'local')
        with monitor.starting('Making data source local', 100):
            data_store.add_pattern(
                data_source_name,
                filePathPattern)  # TODO use monitor, while extracting metadata
            return self.get_data_sources('local', monitor=monitor.child(100))
Example #3
    def make_ds_local(self,
                      data_source_name: str,
                      local_name: str,
                      args: dict,
                      monitor: Monitor = Monitor.NONE) -> list:
        """
        Turns a (likely remote) data source into a local data source given a name and a number of
        optional constraints.

        :param data_source_name: The name of the source data source.
        :param local_name: A human readable name for the new local data source.
        :param args: A dict containing the optional constraints ('start_date', 'end_date', 'region', 'var').
        :param monitor: a progress monitor.
        :return: JSON-serializable list of 'local' data sources, sorted by name.
        """
        with monitor.starting('Making data source local', 100):
            data_sources = query_data_sources(name=data_source_name)
            if not data_sources:
                raise ValueError('data source "%s" not found' %
                                 data_source_name)
            if len(data_sources) > 1:
                raise ValueError(
                    'Multiple data sources with the name "%s" found' %
                    data_source_name)
            data_source = data_sources[0]

            time_range = None
            if 'start_date' in args and 'end_date' in args:
                time_range = (args['start_date'], args['end_date'])
            region = None
            if 'region' in args:
                region = args['region']
            var_names = None
            if 'var' in args:
                var_names = args['var']

            local_data_source = data_source.make_local(
                local_name=local_name,
                time_range=time_range,
                region=region,
                var_names=var_names,
                monitor=monitor.child(98))

            print('local_data_source', local_data_source)
            return self.get_data_sources('local', monitor=monitor.child(2))
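
The constraint handling in make_ds_local can also be exercised in isolation; a short sketch with a hypothetical args dict (the keys mirror those checked above):

# Hedged sketch of the constraint extraction; the args values are placeholders.
args = {'start_date': '2007-01-01', 'end_date': '2007-12-31', 'var': ['sst']}

time_range = None
if 'start_date' in args and 'end_date' in args:
    time_range = (args['start_date'], args['end_date'])
region = args.get('region')      # None when no region constraint is given
var_names = args.get('var')      # None when no variable constraint is given
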
Example #4
def _generic_index_calculation(
        ds: xr.Dataset,
        var: VarName.TYPE,
        region: PolygonLike.TYPE,
        window: int,
        file: str,
        name: str,
        threshold: float = None,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    A generic index calculation. An index is defined here as the anomaly,
    relative to the given reference file, of a moving average (of the given
    window size) over the given region of the given variable of the given
    dataset.

    :param ds: Dataset from which to calculate the index
    :param var: Variable from which to calculate index
    :param region: Spatial subset from which to calculate the index
    :param window: Window size for the moving average
    :param file: Path to the reference file
    :param threshold: Absolute threshold that indicates an ENSO event
    :param name: Name of the index
    :param monitor: a progress monitor.
    :return: A pandas DataFrame that contains the index timeseries
    """
    var = VarName.convert(var)
    region = PolygonLike.convert(region)

    with monitor.starting("Calculate the index", total_work=2):
        ds = select_var(ds, var)
        ds_subset = subset_spatial(ds, region)
        anom = anomaly_external(ds_subset, file, monitor=monitor.child(1))
        with monitor.child(1).observing("Calculate mean"):
            ts = anom.mean(dim=['lat', 'lon'])
        df = pd.DataFrame(data=ts[var].values, columns=[name], index=ts.time)
        retval = df.rolling(window=window, center=True).mean().dropna()

    if threshold is None:
        return retval

    retval['El Nino'] = pd.Series((retval[name] > threshold),
                                  index=retval.index)
    retval['La Nina'] = pd.Series((retval[name] < -threshold),
                                  index=retval.index)
    return retval
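
The post-processing step above (rolling mean plus threshold flags) can be run on its own; a sketch on synthetic data, where the index name, window and threshold are arbitrary placeholders:

# Hedged sketch of the rolling-mean index and El Nino / La Nina flags; the
# random series stands in for the area-mean anomaly computed above.
import numpy as np
import pandas as pd

name, window, threshold = 'ENSO', 5, 0.5
times = pd.date_range('2000-01-01', periods=24, freq='MS')
df = pd.DataFrame({name: np.random.randn(24)}, index=times)

retval = df.rolling(window=window, center=True).mean().dropna()
retval['El Nino'] = pd.Series(retval[name] > threshold, index=retval.index)
retval['La Nina'] = pd.Series(retval[name] < -threshold, index=retval.index)
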
Example #5
def _resample_array(array: xr.DataArray, lon: xr.DataArray, lat: xr.DataArray, method_us: int,
                    method_ds: int, parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample the given xr.DataArray to a new grid defined by lat and lon

    :param array: xr.DataArray with lat,lon and time coordinates
    :param lat: 'lat' xr.DataArray attribute for the new grid
    :param lon: 'lon' xr.DataArray attribute for the new grid
    :param method_us: Interpolation method to use for upsampling, see resampling.py
    :param method_ds: Interpolation method to use for downsampling, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: The resampled array
    """
    # Determine width and height of the resampled array
    width = lon.values.size
    height = lat.values.size

    monitor = parent_monitor.child(1)

    kwargs = {'w': width, 'h': height, 'ds_method': method_ds, 'us_method': method_us, 'parent_monitor': monitor}

    groupby_list = list(array.dims)
    for dim in ['lon', 'lat']:
        groupby_list.remove(dim)

    if not groupby_list:
        # A 2-D dataset: can't do groupby, so do a simple slice resample
        with monitor.starting("coregister dataarray", total_work=1):
            temp_array = _resample_slice(array, **kwargs)
            coords = {'lat': lat, 'lon': lon}
            return xr.DataArray(temp_array.values,
                                name=array.name,
                                dims=array.dims,
                                coords=coords,
                                attrs=array.attrs).chunk()

    num_steps = 1
    for dim in groupby_list:
        num_steps = num_steps * len(array[dim])

    with monitor.starting("coregister dataarray", total_work=num_steps):
        temp_array = _nested_groupby_apply(array, groupby_list, _resample_slice, kwargs)
        chunks = {'lat': height, 'lon': width}
        coords = {'lat': lat, 'lon': lon}
        for dim in groupby_list:
            coords[dim] = array[dim]
            # One spatial slice is one dask chunk, e.g. chunking is
            # (1,1,1..1,len(lat),len(lon))
            chunks[dim] = 1
        return xr.DataArray(temp_array.values,
                            name=array.name,
                            dims=array.dims,
                            coords=coords,
                            attrs=array.attrs).chunk(chunks=chunks)
Example #6
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two timeseries and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'x' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'y' dataset
    :param monitor: a progress monitor.
    :return: {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if len(array_x.dims) != 1 or len(array_y.dims) != 1:
        raise ValueError('To calculate simple correlation, both provided'
                         ' datasets should be simple 1d timeseries. To'
                         ' create a map of correlation coefficients, use'
                         ' pearson_correlation operation instead.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValueError('The length of the time dimension differs between'
                         ' the given datasets. Cannot perform the calculation;'
                         ' please review the operation documentation.')

    if len(array_x['time']) < 3:
        raise ValueError('The length of the time dimension must be at least'
                         ' three to run the calculation.')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.values, array_y.values)

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
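
A brief usage sketch of the scalar correlation, assuming two short 1-D series; plain NumPy arrays are passed to scipy's pearsonr to show the two quantities packed into the returned DataFrame:

# Hedged sketch: the series values are made up and stand in for two 1-D
# timeseries variables extracted from ds_x and ds_y.
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([1.1, 1.9, 3.2, 3.9, 5.1])
cc, pv = pearsonr(x, y)
print(pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]}))
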
Example #7
def no_op(num_steps: int = 10,
          step_duration: float = 0.5,
          fail_before: bool = False,
          fail_after: bool = False,
          monitor: Monitor = Monitor.NONE) -> bool:
    """
    An operation that does nothing but spend a configurable amount of time.
    It may be useful for testing purposes.

    :param num_steps: Number of steps to iterate.
    :param step_duration: How much time to spend in each step in seconds.
    :param fail_before: Whether the operation should fail before spending time doing nothing.
    :param fail_after: Whether the operation should fail after spending time doing nothing.
    :param monitor: A progress monitor.
    :return: Always True
    """
    import time
    monitor.start('Computing nothing', num_steps)
    if fail_before:
        raise ValueError('Intentionally failed before doing anything.')
    for i in range(num_steps):
        time.sleep(step_duration)
        monitor.progress(1.0,
                         'Step %s of %s doing nothing' % (i + 1, num_steps))
    if fail_after:
        raise ValueError('Intentionally failed after doing nothing.')
    monitor.done()
    return True
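
A quick usage sketch, assuming no_op() as defined above is in scope and that the default Monitor.NONE simply discards progress reports:

# Hedged usage sketch of no_op(); step_duration=0.0 keeps it instantaneous.
completed = no_op(num_steps=2, step_duration=0.0)
assert completed is True
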
Example #8
def tseries_mean(ds: xr.Dataset,
                 var: VarNamesLike.TYPE,
                 std_suffix: str = '_std',
                 calculate_std: bool = True,
                 monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Extract spatial mean timeseries of the provided variables and return a
    dataset that, in addition to all the information in the given dataset,
    also contains timeseries data for the provided variables, following the
    naming convention 'var_name1_ts_mean'.

    If a data variable with more dimensions than time/lat/lon is provided,
    the data will be reduced by taking the mean of all data values at a single
    time position, resulting in a one-dimensional timeseries data variable.

    :param ds: The dataset from which to perform timeseries extraction.
    :param var: Variables for which to perform timeseries extraction
    :param calculate_std: Whether to calculate std in addition to mean
    :param std_suffix: Std suffix to use for resulting datasets, if std is calculated.
    :param monitor: a progress monitor.
    :return: Dataset with timeseries variables
    """
    if not var:
        var = '*'

    retset = select_var(ds, var)
    names = retset.data_vars.keys()

    with monitor.starting("Calculate mean", total_work=len(names)):
        for name in names:
            dims = list(ds[name].dims)
            dims.remove('time')
            with monitor.child(1).observing("Calculate mean"):
                retset[name] = retset[name].mean(dim=dims, keep_attrs=True)
            retset[name].attrs['Cate_Description'] = 'Mean aggregated over {} at each point in time.'.format(dims)
            std_name = name + std_suffix
            retset[std_name] = ds[name].std(dim=dims)
            retset[std_name].attrs['Cate_Description'] = 'Accompanying std values for variable \'{}\''.format(name)

    return retset
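
A minimal xarray sketch of the per-variable reduction above, on synthetic data: the mean (and std) over all non-time dimensions yields a 1-D timeseries per variable.

# Hedged sketch; the variable name, sizes and coordinates are placeholders.
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'sst': (('time', 'lat', 'lon'), np.random.rand(4, 3, 5))},
    coords={'time': np.arange(4),
            'lat': np.linspace(-1.0, 1.0, 3),
            'lon': np.linspace(0.0, 4.0, 5)})
dims = [d for d in ds['sst'].dims if d != 'time']
ts_mean = ds['sst'].mean(dim=dims, keep_attrs=True)
ts_std = ds['sst'].std(dim=dims)
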
Example #9
def save_dataset(ds: xr.Dataset,
                 file: str,
                 format: str = None,
                 monitor: Monitor = Monitor.NONE):
    """
    Save a dataset to NetCDF file.

    :param ds: The dataset
    :param file: File path
    :param format: NetCDF format flavour, one of 'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'.
    :param monitor: a progress monitor.
    """
    with monitor.observing("save_dataset"):
        ds.to_netcdf(file, format=format)
Example #10
def _resample_slice(arr_slice: xr.DataArray, w: int, h: int, ds_method: int, us_method: int, parent_monitor: Monitor) -> xr.DataArray:
    """
    Resample a single time slice of a larger xr.DataArray

    :param arr_slice: xr.DataArray single slice
    :param w: The desired new width (number of longitudes)
    :param h: The desired new height (number of latitudes)
    :param ds_method: Downsampling method, see resampling.py
    :param us_method: Upsampling method, see resampling.py
    :param parent_monitor: the parent progress monitor.
    :return: resampled slice
    """
    monitor = parent_monitor.child(1)
    with monitor.observing("resample slice"):
        result = resampling.resample_2d(np.ma.masked_invalid(arr_slice.values),
                                        w,
                                        h,
                                        ds_method,
                                        us_method)
        return xr.DataArray(result)
Example #11
def _resample_dataset(ds_master: xr.Dataset, ds_slave: xr.Dataset, method_us: int, method_ds: int, monitor: Monitor) -> xr.Dataset:
    """
    Resample slave onto the grid of the master.
    This does spatial resampling of the whole dataset, i.e., of all
    variables in the slave dataset.
    This method works only if both datasets have (time, lat, lon) dimensions.

    Note that dataset attributes are not propagated because the CDM attribute set is not yet decided.

    :param ds_master: xr.Dataset whose lat/lon coordinates are used as the resampling grid
    :param ds_slave: xr.Dataset that will be resampled onto the master's grid
    :param method_us: Interpolation method for upsampling, see resampling.py
    :param method_ds: Interpolation method for downsampling, see resampling.py
    :param monitor: a progress monitor.
    :return: xr.Dataset The resampled slave dataset
    """
    # Find lat/lon bounds of the intersection of master and slave grids. The
    # bounds should fall on pixel boundaries for both spatial dimensions for
    # both datasets
    lat_min, lat_max = _find_intersection(ds_master['lat'].values,
                                          ds_slave['lat'].values,
                                          global_bounds=(-90, 90))
    lon_min, lon_max = _find_intersection(ds_master['lon'].values,
                                          ds_slave['lon'].values,
                                          global_bounds=(-180, 180))

    # Subset the slave dataset and the master grid. We do not use the subset
    # operation here, because it may, by design, produce datasets that cross
    # the anti-meridian. However, such a disjoint dataset cannot be resampled
    # using our current resampling methods.
    lat_slice = slice(lat_min, lat_max)
    lon_slice = slice(lon_min, lon_max)

    lon = ds_master['lon'].sel(lon=lon_slice)
    lat = ds_master['lat'].sel(lat=lat_slice)

    with monitor.starting("coregister dataset", len(ds_slave.data_vars)):
        kwargs = {'lon': lon, 'lat': lat, 'method_us': method_us, 'method_ds': method_ds, 'parent_monitor': monitor}
        retset = ds_slave.apply(_resample_array, keep_attrs=True, **kwargs)

    return adjust_spatial_attrs(retset)
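
A small self-contained sketch of the grid-subsetting step only, on a synthetic master grid; the intersection bounds are hard-coded here instead of being computed by _find_intersection:

# Hedged sketch: the grid resolution and the lat/lon bounds are placeholders.
import numpy as np
import xarray as xr

ds_master = xr.Dataset(coords={'lat': np.arange(-89.5, 90.0, 1.0),
                               'lon': np.arange(-179.5, 180.0, 1.0)})
lat = ds_master['lat'].sel(lat=slice(-60.0, 60.0))
lon = ds_master['lon'].sel(lon=slice(-30.0, 30.0))
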
Example #12
    def _sync_files(self, ftp, ftp_base_dir, expected_remote_files,
                    num_of_expected_remote_files, monitor: Monitor) -> int:
        sync_files_number = 0
        checked_files_number = 0

        files_to_download = OrderedDict()
        file_set_size = 0
        for expected_dir_path, expected_filename_dict in expected_remote_files.items():
            if monitor.is_cancelled():
                return sync_files_number
            ftp_dir = ftp_base_dir + '/' + expected_dir_path
            try:
                ftp.cwd(ftp_dir)
            except ftplib.Error:
                # Note: if we can't CWD to ftp_dir, expected_dir_path usually
                # refers to a time range that is not covered remotely.
                monitor.progress(work=1)
                continue

            try:
                remote_dir_content = ftp.mlsd(facts=['type', 'size', 'modify'])
            except ftplib.Error:
                # Note: If we can't MLSD the CWD ftp_dir, we have a problem.
                monitor.progress(work=1)
                continue

            for existing_filename, facts in remote_dir_content:
                if monitor.is_cancelled():
                    return sync_files_number
                if (facts.get('type') == 'file'
                        and existing_filename in expected_filename_dict):
                    # update expected_filename_dict with facts of existing_filename
                    expected_filename_dict[existing_filename] = facts
                    file_size = int(facts.get('size', '-1'))
                    if file_size > 0:
                        file_set_size += file_size
                    # TODO (forman, 20160619): put also 'modify' in file_info, to update outdated local files
                    existing_file_info = dict(size=file_size,
                                              path=expected_dir_path)
                    files_to_download[existing_filename] = existing_file_info

        last_cwd = None
        if files_to_download:
            dl_stat = _DownloadStatistics(file_set_size)
            for existing_filename, existing_file_info in files_to_download.items():
                checked_files_number += 1
                child_monitor = monitor.child(work=1.)
                if monitor.is_cancelled():
                    return sync_files_number
                if last_cwd != existing_file_info['path']:
                    ftp.cwd(ftp_base_dir + '/' + existing_file_info['path'])
                    last_cwd = existing_file_info['path']
                downloader = FtpDownloader(
                    ftp, existing_filename, existing_file_info,
                    self._file_set_data_store.root_dir,
                    (checked_files_number, num_of_expected_remote_files),
                    child_monitor, dl_stat)
                result = downloader.start()
                if result is DownloadStatus.SUCCESS:
                    sync_files_number += 1
        return sync_files_number
Example #13
def _pearsonr(x: xr.DataArray, y: xr.DataArray,
              monitor: Monitor) -> xr.Dataset:
    """
    Calculates Pearson correlation coefficients and p-values for testing
    non-correlation of lon/lat/time xarray datasets for each lon/lat point.

    Heavily influenced by scipy.stats.pearsonr

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed; the datasets need not be zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    :param x: lon/lat/time xr.DataArray
    :param y: xr.DataArray of the same spatiotemporal extents and resolution as x.
    :param monitor: Monitor to use for monitoring the calculation
    :return: A dataset containing the correlation coefficients and p_values on
    the lon/lat grid of x and y.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    with monitor.starting("Calculate Pearson correlation", total_work=6):
        n = len(x['time'])

        xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
        xm_ym = xm * ym
        r_num = xm_ym.sum(dim='time')
        xm_squared = xr.ufuncs.square(xm)
        ym_squared = xr.ufuncs.square(ym)
        r_den = xr.ufuncs.sqrt(
            xm_squared.sum(dim='time') * ym_squared.sum(dim='time'))
        r_den = r_den.where(r_den != 0)
        r = r_num / r_den

        # Presumably, if abs(r) > 1, then it is only some small artifact of floating
        # point arithmetic.
        # At this point r should be a lon/lat dataArray, so it should be safe to
        # load it in memory explicitly. This may take time as it will kick-start
        # deferred processing.
        # Comparing with NaN produces warnings that can be safely ignored
        default_warning_settings = np.seterr(invalid='ignore')
        with monitor.child(1).observing("task 1"):
            negativ_r = r.values < -1.0
        with monitor.child(1).observing("task 2"):
            r.values[negativ_r] = -1.0
        with monitor.child(1).observing("task 3"):
            positiv_r = r.values > 1.0
        with monitor.child(1).observing("task 4"):
            r.values[positiv_r] = 1.0
        np.seterr(**default_warning_settings)
        r.attrs = {
            'description':
            'Correlation coefficients between'
            ' {} and {}.'.format(x.name, y.name)
        }

        df = n - 2
        t_squared = xr.ufuncs.square(r) * (df / ((1.0 - r.where(r != 1)) *
                                                 (1.0 + r.where(r != -1))))
        prob = df / (df + t_squared)
        with monitor.child(1).observing("task 5"):
            prob_values_in = prob.values
        with monitor.child(1).observing("task 6"):
            prob.values = betainc(0.5 * df, 0.5, prob_values_in)
        prob.attrs = {
            'description':
            'Rough indicator of probability of an'
            ' uncorrelated system producing datasets that have a Pearson'
            ' correlation at least as extreme as the one computed from'
            ' these datasets. Not entirely reliable, but reasonable for'
            ' datasets larger than 500 or so.'
        }

        retset = xr.Dataset({'corr_coef': r, 'p_value': prob})
    return retset
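
A worked 1-D check of the formulas above (the correlation numerator/denominator and the betainc-based p-value) against scipy.stats.pearsonr, on made-up data:

# Hedged sketch: plain NumPy arrays replace the lon/lat/time DataArrays so the
# arithmetic can be compared directly with scipy's reference implementation.
import numpy as np
from scipy.special import betainc
from scipy.stats import pearsonr

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
y = np.array([2.0, 1.0, 4.0, 3.0, 6.0, 5.0])
n = x.size

xm, ym = x - x.mean(), y - y.mean()
r = (xm * ym).sum() / np.sqrt((xm ** 2).sum() * (ym ** 2).sum())

df = n - 2
t_squared = r ** 2 * (df / ((1.0 - r) * (1.0 + r)))
p = betainc(0.5 * df, 0.5, df / (df + t_squared))

cc, pv = pearsonr(x, y)
assert np.isclose(r, cc) and np.isclose(p, pv)
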
Example #14
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset to which to do outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wild cards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less than or equal to this will be removed/masked
    :param threshold_high: Values greater than or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Get threshold values from quantiles; use local names so the
                # quantile parameters are not overwritten between variables
                with monitor.child(1).observing("quantile low"):
                    thr_low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    thr_high = ret_ds[var_name].quantile(threshold_high)
            else:
                thr_low = threshold_low
                thr_high = threshold_high
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > thr_low)
                                             & (arr < thr_high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this
                # data variable
                _mask_outliers(ret_ds, var_name, thr_low, thr_high)
            monitor.progress(1)

    return ret_ds
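
A minimal sketch of the quantile-based masking above, on a synthetic 1-D DataArray: values outside the [0.05, 0.95] quantile range become NaN.

# Hedged sketch; the array size and the quantile thresholds are placeholders.
import numpy as np
import xarray as xr

arr = xr.DataArray(np.random.randn(1000), dims='time')
thr_low = float(arr.quantile(0.05))
thr_high = float(arr.quantile(0.95))
masked = arr.where((arr > thr_low) & (arr < thr_high))
print(int(masked.notnull().sum()))  # roughly 900 of the 1000 values survive
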