Example #1
    def getTimeSeriesStatsForBoxSingleDataSet(self,
                                              min_lat,
                                              max_lat,
                                              min_lon,
                                              max_lon,
                                              ds,
                                              start_time=0,
                                              end_time=-1,
                                              applySeasonalFilter=True,
                                              applyLowPass=True,
                                              fill=-9999.,
                                              spark_master="local[1]",
                                              spark_nexecs=1,
                                              spark_nparts=1):
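        # Compute daily mean/min/max statistics over the bounding box with Spark,
        # then apply the optional seasonal-cycle and low-pass filters.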

        daysinrange = self._tile_service.find_days_in_range_asc(
            min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time)

        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
        spark_nparts_needed = min(spark_nparts, ndays)
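        # One work item per Spark partition: the bounding box, dataset name,
        # that partition's share of the days in range, and the fill value.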
        nexus_tiles_spark = [(min_lat, max_lat, min_lon, max_lon, ds,
                              list(daysinrange_part), fill)
                             for daysinrange_part in np.array_split(
                                 daysinrange, spark_nparts_needed)]

        # Launch Spark computations
        rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts_needed)
        results = rdd.map(TimeSeriesCalculator.calc_average_on_day).collect()
        # Flatten the per-partition result lists and sort them by time.
        results = list(itertools.chain.from_iterable(results))
        results = sorted(results, key=lambda entry: entry["time"])

        filt.applyAllFiltersOnField(results,
                                    'mean',
                                    applySeasonal=applySeasonalFilter,
                                    applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results,
                                    'max',
                                    applySeasonal=applySeasonalFilter,
                                    applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results,
                                    'min',
                                    applySeasonal=applySeasonalFilter,
                                    applyLowPass=applyLowPass)

        self._create_nc_file_time1d(np.array(results),
                                    'ts.nc',
                                    'mean',
                                    fill=-9999.)
        return results, {}
Example #2
    def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0, end_time=-1,
                                              applySeasonalFilter=True, applyLowPass=True):

        daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds, start_time,
                                                                end_time)

        if len(daysinrange) == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

        results = []
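        # With a single process configured, compute each day serially;
        # otherwise fan the days out over a multiprocessing Pool.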
        if maxprocesses == 1:
            calculator = TimeSeriesCalculator()
            for dayinseconds in daysinrange:
                result = calculator.calc_average_on_day(min_lat, max_lat, min_lon, max_lon, ds, dayinseconds)
                results.append(result)
        else:
            # Create a task to calculate the average for each day
            manager = Manager()
            work_queue = manager.Queue()
            done_queue = manager.Queue()
            for dayinseconds in daysinrange:
                work_queue.put(
                    ('calc_average_on_day', min_lat, max_lat, min_lon, max_lon, ds, dayinseconds))
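            # One SENTINEL per worker signals the end of the work queue.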
            [work_queue.put(SENTINEL) for _ in xrange(0, maxprocesses)]

            # Start new processes to handle the work
            pool = Pool(maxprocesses)
            [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in xrange(0, maxprocesses)]
            pool.close()

            # Collect the per-day result dicts from the done queue, raising if a worker reported an error
            for i in xrange(0, len(daysinrange)):
                result = done_queue.get()
                try:
                    error_str = result['error']
                    self.log.error(error_str)
                    raise NexusProcessingException(reason="Error calculating average by day.")
                except KeyError:
                    pass

                results.append(result)

            pool.terminate()
            manager.shutdown()

        results = sorted(results, key=lambda entry: entry["time"])

        filt.applyAllFiltersOnField(results, 'mean', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results, 'max', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results, 'min', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)

        return results, {}
Example #3
    def getTimeSeriesStatsForBoxSingleDataSet(self, bounding_polygon, ds, start_seconds_from_epoch,
                                              end_seconds_from_epoch,
                                              apply_seasonal_cycle_filter=True, apply_low_pass_filter=True):

        the_time = datetime.now()
        daysinrange = self._get_tile_service().find_days_in_range_asc(
            bounding_polygon.bounds[1],
            bounding_polygon.bounds[3],
            bounding_polygon.bounds[0],
            bounding_polygon.bounds[2],
            ds,
            start_seconds_from_epoch,
            end_seconds_from_epoch)
        logger.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        if len(daysinrange) == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        the_time = datetime.now()
        maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

        results = []
        if maxprocesses == 1:
            calculator = TimeSeriesCalculator()
            for dayinseconds in daysinrange:
                result = calculator.calc_average_on_day(bounding_polygon.wkt, ds, dayinseconds)
                results += [result] if result else []
        else:
            # Create a task to calculate the average for each day
            manager = Manager()
            work_queue = manager.Queue()
            done_queue = manager.Queue()
            for dayinseconds in daysinrange:
                work_queue.put(
                    ('calc_average_on_day', bounding_polygon.wkt, ds, dayinseconds))
            [work_queue.put(SENTINEL) for _ in range(0, maxprocesses)]

            # Start new processes to handle the work
            pool = Pool(maxprocesses)
            [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in range(0, maxprocesses)]
            pool.close()

            # Collect the per-day result dicts from the done queue, raising if a worker reported an error
            for i in range(0, len(daysinrange)):
                result = done_queue.get()
                try:
                    error_str = result['error']
                    logger.error(error_str)
                    raise NexusProcessingException(reason="Error calculating average by day.")
                except KeyError:
                    pass

                results += [result] if result else []

            pool.terminate()
            manager.shutdown()

        results = sorted(results, key=lambda entry: entry["time"])
        logger.info("Time series calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        if apply_seasonal_cycle_filter:
            the_time = datetime.now()
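            # Subtract each month's climatological mean/min/max (from
            # calculate_monthly_average) to produce the de-seasonalized fields.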
            for result in results:
                month = datetime.utcfromtimestamp(result['time']).month
                month_mean, month_max, month_min = self.calculate_monthly_average(month, bounding_polygon.wkt, ds)
                seasonal_mean = result['mean'] - month_mean
                seasonal_min = result['min'] - month_min
                seasonal_max = result['max'] - month_max
                result['meanSeasonal'] = seasonal_mean
                result['minSeasonal'] = seasonal_min
                result['maxSeasonal'] = seasonal_max
            logger.info(
                "Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        the_time = datetime.now()
        filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False, applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False, applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False, applyLowPass=apply_low_pass_filter)

        if apply_seasonal_cycle_filter and apply_low_pass_filter:
            try:
                filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
            except Exception as e:
                # If it doesn't work log the error but ignore it
                tb = traceback.format_exc()
                logger.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

        logger.info(
            "LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        return results, {}
Example #4
    def calc(self, request, **args):
        """

        :param request: StatsComputeOptions
        :param args: dict
        :return:
        """
        start_time = datetime.now()
        ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, apply_seasonal_cycle_filter, apply_low_pass_filter, nparts_requested, normalize_dates = self.parse_arguments(
            request)
        metrics_record = self._create_metrics_record()

        resultsRaw = []

        for shortName in ds:

            the_time = datetime.now()
            daysinrange = self._get_tile_service().find_days_in_range_asc(
                bounding_polygon.bounds[1],
                bounding_polygon.bounds[3],
                bounding_polygon.bounds[0],
                bounding_polygon.bounds[2],
                shortName,
                start_seconds_from_epoch,
                end_seconds_from_epoch,
                metrics_callback=metrics_record.record_metrics)
            self.log.info("Finding days in range took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            ndays = len(daysinrange)
            if ndays == 0:
                raise NoDataException(
                    reason="No data found for selected timeframe")

            self.log.debug('Found {0} days in range'.format(ndays))
            for i, d in enumerate(daysinrange):
                self.log.debug('{0}, {1}'.format(i,
                                                 datetime.utcfromtimestamp(d)))
            spark_nparts = self._spark_nparts(nparts_requested)
            self.log.info('Using {} partitions'.format(spark_nparts))
            results, meta = spark_driver(daysinrange,
                                         bounding_polygon,
                                         shortName,
                                         self._tile_service_factory,
                                         metrics_record.record_metrics,
                                         normalize_dates,
                                         spark_nparts=spark_nparts,
                                         sc=self._sc)

            if apply_seasonal_cycle_filter:
                the_time = datetime.now()
                # get time series for _clim dataset
                shortName_clim = shortName + "_clim"
                daysinrange_clim = self._get_tile_service(
                ).find_days_in_range_asc(
                    bounding_polygon.bounds[1],
                    bounding_polygon.bounds[3],
                    bounding_polygon.bounds[0],
                    bounding_polygon.bounds[2],
                    shortName_clim,
                    0,
                    SECONDS_IN_ONE_YEAR,
                    metrics_callback=metrics_record.record_metrics)
                if len(daysinrange_clim) == 0:
                    raise NexusProcessingException(
                        reason=
                        "There is no climatology data present for dataset " +
                        shortName + ".")
                results_clim, _ = spark_driver(daysinrange_clim,
                                               bounding_polygon,
                                               shortName_clim,
                                               self._tile_service_factory,
                                               metrics_record.record_metrics,
                                               normalize_dates=False,
                                               spark_nparts=spark_nparts,
                                               sc=self._sc)
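                # Index the climatology results by calendar month; all 12 months must be present.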
                clim_indexed_by_month = {
                    datetime.utcfromtimestamp(result['time']).month: result
                    for result in results_clim
                }
                if len(clim_indexed_by_month) < 12:
                    raise NexusProcessingException(
                        reason="There are only " + str(len(clim_indexed_by_month)) +
                        " months of climatology data for dataset " +
                        shortName +
                        ". A full year of climatology data is required for computing deseasoned timeseries."
                    )

                for result in results:
                    month = datetime.utcfromtimestamp(result['time']).month

                    result['meanSeasonal'] = result[
                        'mean'] - clim_indexed_by_month[month]['mean']
                    result['minSeasonal'] = result[
                        'min'] - clim_indexed_by_month[month]['min']
                    result['maxSeasonal'] = result[
                        'max'] - clim_indexed_by_month[month]['max']
                self.log.info("Seasonal calculation took %s for dataset %s" %
                              (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            filtering.applyAllFiltersOnField(
                results,
                'mean',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'max',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'min',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)

            if apply_seasonal_cycle_filter and apply_low_pass_filter:
                try:
                    filtering.applyFiltersOnField(results,
                                                  'meanSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'minSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'maxSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                except Exception as e:
                    # If it doesn't work log the error but ignore it
                    tb = traceback.format_exc()
                    self.log.warn(
                        "Error calculating SeasonalLowPass filter:\n%s" % tb)

            resultsRaw.append([results, meta])
            self.log.info("LowPass filter calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            self._create_nc_file_time1d(np.array(results),
                                        'ts.nc',
                                        'mean',
                                        fill=-9999.)
            self.log.info("NetCDF generation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        results = self._mergeResults(resultsRaw)

        if len(ds) == 2:
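            # Exactly two datasets were requested: compute comparison statistics between them.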
            try:
                stats = TimeSeriesSparkHandlerImpl.calculate_comparison_stats(
                    results)
            except Exception:
                stats = {}
                tb = traceback.format_exc()
                self.log.warn("Error when calculating comparison stats:\n%s" %
                              tb)
        else:
            stats = {}

        meta = []
        for singleRes in resultsRaw:
            meta.append(singleRes[1])

        res = TimeSeriesResults(results=results,
                                meta=meta,
                                stats=stats,
                                computeOptions=None,
                                minLat=bounding_polygon.bounds[1],
                                maxLat=bounding_polygon.bounds[3],
                                minLon=bounding_polygon.bounds[0],
                                maxLon=bounding_polygon.bounds[2],
                                ds=ds,
                                startTime=start_seconds_from_epoch,
                                endTime=end_seconds_from_epoch)

        total_duration = (datetime.now() - start_time).total_seconds()
        metrics_record.record_metrics(actual_time=total_duration)
        metrics_record.print_metrics(logger)

        self.log.info("Merging results and calculating comparisons took %s" %
                      (str(datetime.now() - the_time)))
        return res
Example #5
    def calc(self, request, **args):
        """
    
        :param request: StatsComputeOptions
        :param args: dict
        :return:
        """

        ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, apply_seasonal_cycle_filter, apply_low_pass_filter, nparts_requested = self.parse_arguments(
            request)

        resultsRaw = []

        for shortName in ds:

            the_time = datetime.now()
            daysinrange = self._tile_service.find_days_in_range_asc(
                bounding_polygon.bounds[1], bounding_polygon.bounds[3],
                bounding_polygon.bounds[0], bounding_polygon.bounds[2],
                shortName, start_seconds_from_epoch, end_seconds_from_epoch)
            self.log.info("Finding days in range took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            ndays = len(daysinrange)
            if ndays == 0:
                raise NoDataException(
                    reason="No data found for selected timeframe")

            self.log.debug('Found {0} days in range'.format(ndays))
            for i, d in enumerate(daysinrange):
                self.log.debug('{0}, {1}'.format(i,
                                                 datetime.utcfromtimestamp(d)))
            spark_nparts = self._spark_nparts(nparts_requested)
            self.log.info('Using {} partitions'.format(spark_nparts))
            the_time = datetime.now()
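            # Run the per-day averaging on Spark across spark_nparts partitions.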
            results, meta = spark_driver(daysinrange,
                                         bounding_polygon,
                                         shortName,
                                         spark_nparts=spark_nparts,
                                         sc=self._sc)
            self.log.info("Time series calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            if apply_seasonal_cycle_filter:
                the_time = datetime.now()
                for result in results:
                    month = datetime.utcfromtimestamp(result['time']).month
                    month_mean, month_max, month_min = self.calculate_monthly_average(
                        month, bounding_polygon.wkt, shortName)
                    seasonal_mean = result['mean'] - month_mean
                    seasonal_min = result['min'] - month_min
                    seasonal_max = result['max'] - month_max
                    result['meanSeasonal'] = seasonal_mean
                    result['minSeasonal'] = seasonal_min
                    result['maxSeasonal'] = seasonal_max
                self.log.info("Seasonal calculation took %s for dataset %s" %
                              (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            filtering.applyAllFiltersOnField(
                results,
                'mean',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'max',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'min',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)

            if apply_seasonal_cycle_filter and apply_low_pass_filter:
                try:
                    filtering.applyFiltersOnField(results,
                                                  'meanSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'minSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'maxSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                except Exception as e:
                    # If it doesn't work log the error but ignore it
                    tb = traceback.format_exc()
                    self.log.warn(
                        "Error calculating SeasonalLowPass filter:\n%s" % tb)

            resultsRaw.append([results, meta])
            self.log.info("LowPass filter calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            self._create_nc_file_time1d(np.array(results),
                                        'ts.nc',
                                        'mean',
                                        fill=-9999.)
            self.log.info("NetCDF generation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        results = self._mergeResults(resultsRaw)

        if len(ds) == 2:
            try:
                stats = TimeSeriesHandlerImpl.calculate_comparison_stats(
                    results)
            except Exception:
                stats = {}
                tb = traceback.format_exc()
                self.log.warn("Error when calculating comparison stats:\n%s" %
                              tb)
        else:
            stats = {}

        meta = []
        for singleRes in resultsRaw:
            meta.append(singleRes[1])

        res = TimeSeriesResults(results=results,
                                meta=meta,
                                stats=stats,
                                computeOptions=None,
                                minLat=bounding_polygon.bounds[1],
                                maxLat=bounding_polygon.bounds[3],
                                minLon=bounding_polygon.bounds[0],
                                maxLon=bounding_polygon.bounds[2],
                                ds=ds,
                                startTime=start_seconds_from_epoch,
                                endTime=end_seconds_from_epoch)

        self.log.info("Merging results and calculating comparisons took %s" %
                      (str(datetime.now() - the_time)))
        return res