Ejemplo n.º 1
0
    def test_query_data_sources_default_data_store(self):
        """
        Query data sources without passing ``data_stores``.

        The registry is emptied, the ESA CCI FTP store is installed through
        ``set_default_data_store`` and the queries are run against it. The
        stores that were registered beforehand are put back in any case.
        """
        initial_count = len(ds.DATA_STORE_REGISTRY)
        saved_stores = list(ds.DATA_STORE_REGISTRY.get_data_stores())
        try:
            # Start from an empty registry so only the default store counts.
            ds.DATA_STORE_REGISTRY._data_stores.clear()
            self.assertEqual(0, len(ds.DATA_STORE_REGISTRY))

            from cate.ds.esa_cci_ftp import set_default_data_store as set_default_data_store_ftp
            set_default_data_store_ftp()
            self.assertEqual(1, len(ds.DATA_STORE_REGISTRY))

            first_name = "AEROSOL_ATSR2_SU_L3_V4.2_DAILY"

            # Unconstrained query: everything the store offers.
            everything = ds.query_data_sources()
            self.assertIsNotNone(everything)
            self.assertEqual(len(everything), 98)
            self.assertEqual(everything[0].name, first_name)

            # Query by an existing name.
            matching = ds.query_data_sources(name=first_name)
            self.assertIsNotNone(matching)
            self.assertEqual(len(matching), 1)

            # Query by a name that matches nothing.
            nothing = ds.query_data_sources(name="ZZ")
            self.assertIsNotNone(nothing)
            self.assertEqual(len(nothing), 0)
        finally:
            # Restore the registry exactly as it was found.
            ds.DATA_STORE_REGISTRY._data_stores.clear()
            for saved_store in saved_stores:
                ds.DATA_STORE_REGISTRY.add_data_store(saved_store)
        self.assertEqual(initial_count, len(ds.DATA_STORE_REGISTRY))
Ejemplo n.º 2
0
    def test_query_data_sources_with_constrains(self):
        """Query the test data store by name and check which data sources come back."""
        # (name passed to the query, names of the data sources expected back)
        cases = (
            ("aerosol", ("aerosol",)),
            ("ozone", ("ozone",)),
            ("Z", ()),
        )
        for query_name, expected_names in cases:
            found = ds.query_data_sources(data_stores=self.TEST_DATA_STORE, name=query_name)
            self.assertIsNotNone(found)
            self.assertEqual(len(found), len(expected_names))
            for index, expected_name in enumerate(expected_names):
                self.assertEqual(found[index].name, expected_name)
Ejemplo n.º 3
0
 def test_query_data_sources_with_data_store_list(self):
     """Query two data stores at once and check the order of the combined result."""
     stores = [self.TEST_DATA_STORE, self.TEST_DATA_STORE_SST]
     found = ds.query_data_sources(data_stores=stores)
     self.assertIsNotNone(found)

     expected_names = ("aerosol", "ozone", "sst")
     self.assertEqual(len(found), len(expected_names))
     for index, expected_name in enumerate(expected_names):
         self.assertEqual(found[index].name, expected_name)
Ejemplo n.º 4
0
    def make_ds_local(self,
                      data_source_name: str,
                      local_name: str,
                      args: dict,
                      monitor: Monitor = Monitor.NONE) -> list:
        """
        Turns a (likely remote) data source into a local data source given a name and a number of
        optional constraints.

        :param data_source_name: The name of the source data source.
        :param local_name: A human readable name for the new local data source.
        :param args: A dict containing the constraints. The keys read are
            'start_date' and 'end_date' (used as time range only when both are
            present), 'region' and 'var'. ``None`` is treated as "no constraints".
        :param monitor: a progress monitor.
        :return: JSON-serializable list of 'local' data sources, sorted by name.
        :raise ValueError: if no data source, or more than one data source,
            is found for ``data_source_name``.
        """
        constraints = args or {}

        with monitor.starting('Making data source local', 100):
            data_sources = query_data_sources(name=data_source_name)
            if not data_sources:
                raise ValueError('data source "%s" not found' %
                                 data_source_name)
            if len(data_sources) > 1:
                raise ValueError(
                    'Multiple data sources with the name "%s" found' %
                    data_source_name)
            data_source = data_sources[0]

            # A time range is only meaningful with both of its ends.
            time_range = None
            if 'start_date' in constraints and 'end_date' in constraints:
                time_range = (constraints['start_date'], constraints['end_date'])

            # 98 of the 100 work units go to making the data local, the
            # remaining 2 to listing the local data sources afterwards.
            data_source.make_local(
                local_name=local_name,
                time_range=time_range,
                region=constraints.get('region'),
                var_names=constraints.get('var'),
                monitor=monitor.child(98))

            return self.get_data_sources('local', monitor=monitor.child(2))
Ejemplo n.º 5
0
    def update_local(self,
                     local_id: str,
                     time_range: TimeRangeLike.TYPE,
                     monitor: Monitor = Monitor.NONE) -> bool:
        """
        Adjust the temporal coverage of a local data source to ``time_range``.

        Both ends of the requested range are compared with the data source's
        current temporal coverage. Parts of the coverage outside the requested
        range are handed to ``reduce_temporal_coverage``; parts of the requested
        range outside the coverage are handed to ``self._make_local``.

        :param local_id: Name of the local data source to update.
        :param time_range: The requested time range. Nothing is changed if it
            is empty or if its end is not later than its start.
        :param monitor: A progress monitor, passed on to ``self._make_local``.
        :return: True if at least one range was removed or added.
        :raise ValueError: if no LocalDataSource named ``local_id`` is found.
        """
        candidates = query_data_sources(
            None, local_id)  # type: Sequence['DataSource']

        # The query may return other matches; only an exactly named local
        # data source qualifies.
        data_source = None  # type: LocalDataSource
        for candidate in candidates:
            if isinstance(candidate, LocalDataSource) and candidate.name == local_id:
                data_source = candidate
                break
        if not data_source:
            raise ValueError("Couldn't find local DataSource",
                             (local_id, candidates))

        if time_range:
            time_range = TimeRangeLike.convert(time_range)
        else:
            time_range = None

        obsolete_ranges = []
        missing_ranges = []
        if time_range and time_range[1] > time_range[0]:
            wanted_start = time_range[0]
            wanted_end = time_range[1]

            # Start of the range: later than the coverage start shrinks the
            # coverage, earlier extends it.
            if wanted_start != data_source.temporal_coverage()[0]:
                if wanted_start > data_source.temporal_coverage()[0]:
                    obsolete_ranges.append(
                        (data_source.temporal_coverage()[0], wanted_start))
                else:
                    missing_ranges.append(
                        (wanted_start, data_source.temporal_coverage()[0]))

            # End of the range: earlier than the coverage end shrinks the
            # coverage, later extends it.
            if wanted_end != data_source.temporal_coverage()[1]:
                if wanted_end < data_source.temporal_coverage()[1]:
                    obsolete_ranges.append(
                        (wanted_end, data_source.temporal_coverage()[1]))
                else:
                    missing_ranges.append(
                        (data_source.temporal_coverage()[1], wanted_end))

        for obsolete_range in obsolete_ranges:
            data_source.reduce_temporal_coverage(obsolete_range)
        for missing_range in missing_ranges:
            self._make_local(data_source, missing_range, None,
                             data_source.variables_info, monitor)
        return bool(obsolete_ranges or missing_ranges)
Ejemplo n.º 6
0
def long_term_average(source: str,
                      year_min: int,
                      year_max: int,
                      file: str,
                      var: VarNamesLike.TYPE = None,
                      save: bool = False,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform the long term monthly average of the given monthly data source
    for the given range of years.

    Only data sources whose 'time_frequency' meta information is 'mon' are
    supported for the time being.

    Depending on the given year range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    :param source: The data source from which to extract the monthly average
    :param year_min: The earliest year of the desired time range
    :param year_max: The most recent year of the desired time range
    :param file: filepath where to save the long term average dataset
    :param var: If given, only these variable names will be preserved in the
    output.
    :param save: If True, saves the data downloaded during this operation. This
    can potentially be a very large amount of data.
    :param monitor: A progress monitor to use
    :return: The Long Term Average dataset.
    :raise ValueError: if year_max is earlier than year_min, if the query for
    source does not yield exactly one data source, or if the data source is
    not a monthly one.
    :raise TypeError: if a data array of the dataset can not be divided.
    """
    # An inverted year range would otherwise never reach the end of the
    # year loop (or divide by zero when computing the progress step).
    if year_max < year_min:
        raise ValueError("year_max ({}) must not be earlier than year_min "
                         "({})".format(year_max, year_min))

    var = VarNamesLike.convert(var)

    n_years = year_max - year_min + 1
    total_work = 100

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query "
                         "term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query "
                         "term {}".format(data_sources, source))

    data_source = data_sources[0]
    source_info = data_source.cache_info

    # Check if we have a monthly data source
    fq = data_source.meta_info['time_frequency']
    if fq != 'mon':
        raise ValueError("Only monthly datasets are supported for time being.")

    # None marks "nothing accumulated yet". Testing `res == 0` is unreliable
    # once res is an xarray Dataset: the comparison is element-wise and the
    # truth value of its result does not tell whether accumulation started.
    res = None

    with monitor.starting('LTA', total_work=total_work):
        # Set up the monitor: 90% of the work is spread over the years,
        # the remaining 10% is reported after saving the result.
        monitor.progress(work=0)
        step = total_work * 0.9 / n_years

        # Process the data source year by year
        for year in range(year_min, year_max + 1):
            tmin = "{}-01-01".format(year)
            tmax = "{}-12-31".format(year)
            dt_range = to_datetime_range(tmin, tmax)

            # Determine if the data for the given year are already downloaded
            # If at least one file of the given time range is present, we
            # don't delete the data for this year, we do the syncing anyway.
            was_already_downloaded = any(
                dt_range[0] <= date <= dt_range[1] for date in source_info)

            # If syncing did not advance the monitor, account for its share
            # of the step here so that the overall progress stays consistent.
            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            try:
                if res is None:
                    res = ds / n_years
                else:
                    # Xarray doesn't do automatic alignment for in place
                    # operations, hence we have to do it manually
                    res = res + ds.reindex_like(res) / n_years
            except TypeError as error:
                raise TypeError('One or more data arrays feature a dtype that '
                                'can not be divided. Consider using the var '
                                'parameter to filter the dataset.') from error
            finally:
                # Release the yearly dataset even if the division failed.
                ds.close()

            # delete data for the current year, if it should be deleted and it
            # was not already downloaded.
            if (not save) and (not was_already_downloaded):
                data_source.delete_local(dt_range)

            monitor.progress(work=step * 0.1)

        monitor.progress(msg='Saving the LTA dataset')
        save_dataset(res, file)
        monitor.progress(total_work * 0.1)

    return res
Ejemplo n.º 7
0
def temporal_agg(source: str,
                 start_date: str = None,
                 end_date: str = None,
                 var: VarNamesLike.TYPE = None,
                 level: str = 'mon',
                 method: str = 'mean',
                 save_data: bool = False,
                 monitor: Monitor = Monitor.NONE) -> (xr.Dataset, str):
    """
    Perform temporal aggregation of the given data source to the given level
    using the given method for the given time range. Only full time periods
    of the given time range will be aggregated.

    NOTE: The operation is not finished. The first statement of the function
    raises a ValueError unconditionally, so none of the code below it is
    currently executed. The description that follows is the intended
    behaviour of the draft implementation.

    Depending on the given time range, data size, as well as internet
    connection quality, this operation can potentially take a very long time
    to finish.

    Careful consideration is needed in choosing the var parameter to create
    meaningful outputs. This is unique for each data source.

    The aggregation result is saved into the local data store for later reuse.

    :param source: Data source to aggregate
    :param start_date: Start date of aggregation. If not given, data source
    start date is used instead
    :param end_date: End date of aggregation. If not given, data source end
    date is used instead
    :param var: If given, only these dataset variables will be preserved in the
    result
    :param level: Aggregation level
    :param method: Aggregation method
    :param save_data: Whether to save data downloaded during this operation.
    This can potentially be a lot of data.
    :param monitor: A progress monitor to use
    :return: The local data source identifier for the aggregated data
    :raise ValueError: always, as the operation is not implemented yet
    """
    # Raise not implemented, while not finished
    raise ValueError("Operation is not implemented.")

    # NOTE(review): everything below is unreachable draft code. Open points
    # visible in the draft: the 'method' parameter is never used, the
    # aggregation and saving steps are placeholders, the function returns
    # None although the annotation promises (xr.Dataset, str), and the last
    # monitor.progress() call sits outside of the 'with monitor.starting'
    # block.

    var = VarNamesLike.convert(var)

    # Select the appropriate data source
    data_store_list = DATA_STORE_REGISTRY.get_data_stores()
    data_sources = query_data_sources(data_store_list, name=source)
    if len(data_sources) == 0:
        raise ValueError("No data_source found for the given query "
                         "term {}".format(source))
    elif len(data_sources) > 1:
        raise ValueError("{} data_sources found for the given query "
                         "term {}".format(data_sources, source))

    data_source = data_sources[0]
    source_info = data_source.cache_info

    # We have to do this to have temporal coverage info in meta_info
    data_source._init_file_list()

    # Check if the data source temporal resolution is known
    known_res = ('day', '8-days', 'mon', 'yr')

    fq = data_source.meta_info['time_frequency']
    if (not fq) or (fq not in known_res):
        raise ValueError("The given data source features unknown time "
                         "resolution: {}".format(fq))

    # Check if the operation supports the desired aggregation step
    # (only daily -> monthly is listed as valid)
    valid_steps = list()
    valid_steps.append(('day', 'mon'))
    if (fq, level) not in valid_steps:
        raise ValueError("Currently the operation does not support aggregation"
                         " from {} to {}".format(fq, level))

    # Determine start and end dates
    if not start_date:
        start_date = data_source.meta_info['temporal_coverage_start']
    start_date = to_datetime(start_date)
    # If start_date is not start of the month, move it to the 1st of next
    # month
    if start_date.day != 1:
        try:
            start_date = datetime(start_date.year, start_date.month + 1, 1)
        except ValueError:
            # We have tried to set the month to 13
            start_date = datetime(start_date.year + 1, 1, 1)

    if not end_date:
        end_date = data_source.meta_info['temporal_coverage_end']
    end_date = to_datetime(end_date)
    # If end date is not end of the month, move it to the last day of the
    # previous month
    if not _is_end_of_month(end_date):
        try:
            # Day 27 is only a placeholder that exists in every month; the
            # date is passed through _end_of_month() right below.
            end_date = datetime(end_date.year, end_date.month - 1, 27)
        except ValueError:
            # We have tried to set the month to 0
            end_date = datetime(end_date.year - 1, 12, 31)

    end_date = _end_of_month(end_date.year, end_date.month)

    # Determine the count of processing periods
    n_periods = (end_date.year - start_date.year + 1) * 12\
        + end_date.month - start_date.month - 11
    # 2000-4-1, 2000-6-30 -> 12 + 2 -11 = 3

    if n_periods < 1:
        raise ValueError("The given time range does not contain any full "
                         "calendar months to do aggregation with.")

    # Set up the monitor
    total_work = 100
    with monitor.starting('Aggregate', total_work=total_work):
        monitor.progress(work=0)
        # 90% of the work is spread evenly over the periods
        step = total_work * 0.9 / n_periods

        # Process the data source period by period
        tmin = start_date
        while tmin < end_date:
            tmax = _end_of_month(tmin.year, tmin.month)

            # Determine if the data for the given period are already downloaded
            # If at least one file of the given time range is present, we
            # don't delete the data for this period, we do the syncing anyway
            was_already_downloaded = False
            dt_range = to_datetime_range(tmin, tmax)
            for date in source_info:
                if dt_range[0] <= date <= dt_range[1]:
                    was_already_downloaded = True
                    # One is enough
                    break

            # If syncing did not advance the monitor, report the share of
            # the step that was reserved for it manually.
            worked = monitor._worked
            data_source.sync(dt_range, monitor=monitor.child(work=step * 0.9))
            if worked == monitor._worked:
                monitor.progress(work=step * 0.9)

            ds = data_source.open_dataset(dt_range)

            # Filter the dataset
            ds = select_var(ds, var)

            # Do the aggregation

            # Save the dataset for this period into local data store

            # Close and delete the files if needed
            ds.close()
            # delete data for the current period,if it should be deleted and it
            # was not already downloaded.
            if (not save_data) and (not was_already_downloaded):
                data_source.delete_local(dt_range)

            monitor.progress(work=step * 0.1)

            # tmin for next iteration
            try:
                tmin = datetime(tmin.year, tmin.month + 1, 1)
            except ValueError:
                # Couldn't add a month -> end of year
                tmin = datetime(tmin.year + 1, 1, 1)
            pass

    # NOTE(review): this call is made after the 'with' block has been left
    # and reports a per-period share rather than the remaining 10% - confirm
    # the intent before enabling the operation.
    monitor.progress(work=step * 0.1)

    # Return the local data source id
    return None
Ejemplo n.º 8
0
 def test_query_data_sources_with_data_store_value(self):
     """Query a single data store passed as a plain value and check the result order."""
     found = ds.query_data_sources(data_stores=self.TEST_DATA_STORE)
     self.assertIsNotNone(found)

     expected_names = ("aerosol", "ozone")
     self.assertEqual(len(found), len(expected_names))
     for index, expected_name in enumerate(expected_names):
         self.assertEqual(found[index].name, expected_name)