Code Example #1
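This handler appears to come from the Apache SDAP/NEXUS longitude Hofmoeller endpoint: it parses the request options, enumerates the matching tiles as lightweight tuples (metadata only, fetch_data=False), runs them through a spark_driver helper, and sorts the per-timestep results by longitude before wrapping them in HoffMoellerResults.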
    def calc(self, compute_options, **args):
        ds, bbox, start_time, end_time = self.parse_arguments(compute_options)

        min_lon, min_lat, max_lon, max_lat = bbox.bounds

        nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon) for x, tile in
                             enumerate(self._tile_service.find_tiles_in_box(min_lat, max_lat, min_lon, max_lon,
                                                                            ds, start_time, end_time,
                                                                            fetch_data=False))]

        print ("Got {} tiles".format(len(nexus_tiles_spark)))
        if len(nexus_tiles_spark) == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        results = spark_driver(self._sc, self._latlon, nexus_tiles_spark)

        results = filter(None, results)
        results = sorted(results, key=lambda entry: entry["time"])
        for i in range(len(results)):
            results[i]['lons'] = sorted(results[i]['lons'],
                                        key=lambda entry: entry['longitude'])

        # Deseason disabled. See SDAP-148
        # results = self.applyDeseasonToHofMoeller(results, pivot="lons")

        result = HoffMoellerResults(results=results, compute_options=None, type=HoffMoellerResults.LONGITUDE,
                                    minLat=min_lat, maxLat=max_lat, minLon=min_lon,
                                    maxLon=max_lon, ds=ds, startTime=start_time, endTime=end_time)
        return result
Code Example #2
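A later variant of the same longitude Hofmoeller calculation: it records metrics on a metrics record, obtains the tile service from a factory that is also passed to spark_driver, honors a normalize_dates option, and times the whole request.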
    def calc(self, compute_options, **args):
        ds, bbox, start_time, end_time, normalize_dates = self.parse_arguments(
            compute_options)

        metrics_record = self._create_metrics_record()
        calculation_start = datetime.now()

        min_lon, min_lat, max_lon, max_lat = bbox.bounds

        nexus_tiles_spark = [
            (self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon)
            for x, tile in enumerate(self._get_tile_service(
            ).find_tiles_in_box(min_lat,
                                max_lat,
                                min_lon,
                                max_lon,
                                ds,
                                start_time,
                                end_time,
                                metrics_callback=metrics_record.record_metrics,
                                fetch_data=False))
        ]

        print(("Got {} tiles".format(len(nexus_tiles_spark))))
        if len(nexus_tiles_spark) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        results = spark_driver(self._sc, self._latlon,
                               self._tile_service_factory, nexus_tiles_spark,
                               metrics_record.record_metrics, normalize_dates)

        results = [_f for _f in results if _f]
        results = sorted(results, key=lambda entry: entry["time"])
        for i in range(len(results)):
            results[i]['lons'] = sorted(results[i]['lons'],
                                        key=lambda entry: entry['longitude'])

        # Deseason disabled. See SDAP-148
        # results = self.applyDeseasonToHofMoeller(results, pivot="lons")

        result = HoffMoellerResults(results=results,
                                    compute_options=None,
                                    type=HoffMoellerResults.LONGITUDE,
                                    minLat=min_lat,
                                    maxLat=max_lat,
                                    minLon=min_lon,
                                    maxLon=max_lon,
                                    ds=ds,
                                    startTime=start_time,
                                    endTime=end_time)

        duration = (datetime.now() - calculation_start).total_seconds()
        metrics_record.record_metrics(actual_time=duration)
        metrics_record.print_metrics(self.log)

        return result
Code Example #3
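A Spark version of the box time-series statistics: the days in range are split into at most spark_nparts partitions, daily averages are computed on the cluster, seasonal and low-pass filters are applied to the mean, max and min fields, and a ts.nc file is written.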
    def getTimeSeriesStatsForBoxSingleDataSet(self,
                                              min_lat,
                                              max_lat,
                                              min_lon,
                                              max_lon,
                                              ds,
                                              start_time=0,
                                              end_time=-1,
                                              applySeasonalFilter=True,
                                              applyLowPass=True,
                                              fill=-9999.,
                                              spark_master="local[1]",
                                              spark_nexecs=1,
                                              spark_nparts=1):

        daysinrange = self._tile_service.find_days_in_range_asc(
            min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time)

        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
        spark_nparts_needed = min(spark_nparts, ndays)
        nexus_tiles_spark = [(min_lat, max_lat, min_lon, max_lon, ds,
                              list(daysinrange_part), fill)
                             for daysinrange_part in np.array_split(
                                 daysinrange, spark_nparts_needed)]

        # Launch Spark computations
        rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts_needed)
        results = rdd.map(TimeSeriesCalculator.calc_average_on_day).collect()
        #
        results = list(itertools.chain.from_iterable(results))
        results = sorted(results, key=lambda entry: entry["time"])

        filt.applyAllFiltersOnField(results,
                                    'mean',
                                    applySeasonal=applySeasonalFilter,
                                    applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results,
                                    'max',
                                    applySeasonal=applySeasonalFilter,
                                    applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results,
                                    'min',
                                    applySeasonal=applySeasonalFilter,
                                    applyLowPass=applyLowPass)

        self._create_nc_file_time1d(np.array(results),
                                    'ts.nc',
                                    'mean',
                                    fill=-9999.)
        return results, {}
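The partitioning above relies on np.array_split, which, unlike np.split, accepts a section count that does not divide the input evenly; together with min(spark_nparts, ndays) this keeps every partition non-empty. A minimal standalone illustration (not project code):

import numpy as np

daysinrange = np.arange(10)              # stand-in for epoch timestamps
parts = np.array_split(daysinrange, 3)   # uneven split is fine
print([list(p) for p in parts])          # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]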
Code Example #4
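An older multiprocessing version of the same time-series computation: with maxprocesses == 1 it loops over the days serially, otherwise it fans the per-day work out over a process Pool fed through Manager queues. The xrange calls mark this variant as Python 2.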
    def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0, end_time=-1,
                                              applySeasonalFilter=True, applyLowPass=True):

        daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds, start_time,
                                                                end_time)

        if len(daysinrange) == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

        results = []
        if maxprocesses == 1:
            calculator = TimeSeriesCalculator()
            for dayinseconds in daysinrange:
                result = calculator.calc_average_on_day(min_lat, max_lat, min_lon, max_lon, ds, dayinseconds)
                results.append(result)
        else:
            # Create a task to calc average difference for each day
            manager = Manager()
            work_queue = manager.Queue()
            done_queue = manager.Queue()
            for dayinseconds in daysinrange:
                work_queue.put(
                    ('calc_average_on_day', min_lat, max_lat, min_lon, max_lon, ds, dayinseconds))
            [work_queue.put(SENTINEL) for _ in xrange(0, maxprocesses)]

            # Start new processes to handle the work
            pool = Pool(maxprocesses)
            [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in xrange(0, maxprocesses)]
            pool.close()

            # Collect the results as [(day (in ms), average difference for that day)]
            for i in xrange(0, len(daysinrange)):
                result = done_queue.get()
                try:
                    error_str = result['error']
                    self.log.error(error_str)
                    raise NexusProcessingException(reason="Error calculating average by day.")
                except KeyError:
                    pass

                results.append(result)

            pool.terminate()
            manager.shutdown()

        results = sorted(results, key=lambda entry: entry["time"])

        filt.applyAllFiltersOnField(results, 'mean', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results, 'max', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
        filt.applyAllFiltersOnField(results, 'min', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)

        return results, {}
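pool_worker and SENTINEL are referenced above but not shown. The following is only a sketch of what such a worker might look like; the sentinel value, the task layout and the {'error': ...} result format are assumptions based on how the queues are used in the example:

SENTINEL = 'STOP'  # assumed marker value

def pool_worker(work_queue, done_queue):
    # Drain tasks until the sentinel appears; each task is assumed to be a
    # tuple of a TimeSeriesCalculator method name followed by its arguments.
    calculator = TimeSeriesCalculator()
    for task in iter(work_queue.get, SENTINEL):
        try:
            method_name, call_args = task[0], task[1:]
            done_queue.put(getattr(calculator, method_name)(*call_args))
        except Exception as e:
            # The consumer checks each result for an 'error' key.
            done_queue.put({'error': str(e)})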
Code Example #5
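A single-process, Python 2 version of the time-averaged map handler: it derives the native grid resolution, maps each tile to a time-averaged sub-array with self._map, pastes the unmasked tiles into a global NumPy array, and writes that array to a NetCDF file.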
    def calc(self, computeOptions, **args):
        """

        :param computeOptions: StatsComputeOptions
        :param args: dict
        :return:
        """

        self._minLat = float(computeOptions.get_min_lat())
        self._maxLat = float(computeOptions.get_max_lat())
        self._minLon = float(computeOptions.get_min_lon())
        self._maxLon = float(computeOptions.get_max_lon())
        self._ds = computeOptions.get_dataset()[0]
        self._startTime = computeOptions.get_start_time()
        self._endTime = computeOptions.get_end_time()

        self._find_native_resolution()
        print 'Using Native resolution: lat_res=%f, lon_res=%f' % (
            self._latRes, self._lonRes)
        self._minLatCent = self._minLat + self._latRes / 2
        self._minLonCent = self._minLon + self._lonRes / 2
        nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
        nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
        self._maxLatCent = self._minLatCent + (nlats - 1) * self._latRes
        self._maxLonCent = self._minLonCent + (nlons - 1) * self._lonRes
        print 'nlats=', nlats, 'nlons=', nlons
        print 'center lat range = %f to %f' % (self._minLatCent,
                                               self._maxLatCent)
        print 'center lon range = %f to %f' % (self._minLonCent,
                                               self._maxLonCent)
        sys.stdout.flush()
        a = np.zeros((nlats, nlons), dtype=np.float64, order='C')

        nexus_tiles = self._find_global_tile_set()
        # print 'tiles:'
        # for tile in nexus_tiles:
        #     print tile.granule
        #     print tile.section_spec
        #     print 'lat:', tile.latitudes
        #     print 'lon:', tile.longitudes

        #                                                          nexus_tiles)
        if len(nexus_tiles) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        print 'Initially found %d tiles' % len(nexus_tiles)
        sys.stdout.flush()
        self._prune_tiles(nexus_tiles)
        print 'Pruned to %d tiles' % len(nexus_tiles)
        sys.stdout.flush()
        #for tile in nexus_tiles:
        #    print 'lats: ', tile.latitudes.compressed()
        #    print 'lons: ', tile.longitudes.compressed()

        avg_tiles = map(self._map, nexus_tiles)
        print 'shape a = ', a.shape
        sys.stdout.flush()
        # The tiles below are NOT Nexus objects.  They are tuples
        # with the time avg map data and lat-lon bounding box.
        for tile in avg_tiles:
            if tile is not None:
                (tile_data, tile_min_lat, tile_max_lat, tile_min_lon,
                 tile_max_lon) = tile
                print 'shape tile_data = ', tile_data.shape
                print 'tile data mask = ', tile_data.mask
                sys.stdout.flush()
                # Compute map indices first so both branches can report them.
                y0 = self._lat2ind(tile_min_lat)
                y1 = self._lat2ind(tile_max_lat)
                x0 = self._lon2ind(tile_min_lon)
                x1 = self._lon2ind(tile_max_lon)
                if np.any(np.logical_not(tile_data.mask)):
                    print 'writing tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                        (tile_min_lat, tile_max_lat,
                         tile_min_lon, tile_max_lon, y0, y1, x0, x1)
                    sys.stdout.flush()
                    a[y0:y1 + 1, x0:x1 + 1] = tile_data
                else:
                    print 'All pixels masked in tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                        (tile_min_lat, tile_max_lat,
                         tile_min_lon, tile_max_lon, y0, y1, x0, x1)
                    sys.stdout.flush()

        self._create_nc_file(a)

        return TimeAvgMapResults(results={},
                                 meta={},
                                 computeOptions=computeOptions)
Code Example #6
File: LongitudeLatitudeMap.py Project: hdfeos/nexus
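A chunk-based latitude/longitude map: for every one-degree cell it collects the values that pass the land-mask and fill-value (32767.0) checks across the time chunks, then stores average, min, max, standard deviation, count and a stats.linregress fit per cell. Note that the regression runs against the sample index rather than the timestamps, so the slope is a per-sample trend.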
    def calc(self, computeOptions, **args):
        minLat = computeOptions.get_min_lat()
        maxLat = computeOptions.get_max_lat()
        minLon = computeOptions.get_min_lon()
        maxLon = computeOptions.get_max_lon()
        ds = computeOptions.get_dataset()[0]
        startTime = computeOptions.get_start_time()
        endTime = computeOptions.get_end_time()
        maskLimitType = computeOptions.get_mask_type()

        chunks, meta = self.getChunksForBox(minLat,
                                            maxLat,
                                            minLon,
                                            maxLon,
                                            ds,
                                            startTime=startTime,
                                            endTime=endTime)

        if len(chunks) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        masker = LandMaskChecker(self._landmask, maskLimitType)
        a = self._allocateArray(int(math.ceil(maxLat - minLat)),
                                int(math.ceil(maxLon - minLon)))
        lat = minLat
        y = 0
        x = 0
        while lat < maxLat:
            lon = minLon
            x = 0
            while lon < maxLon:

                values = []
                # for t in range(0, len(chunks)):
                for n in chunks:

                    chunk = chunks[n]
                    value = chunk.getValueForLatLon(lat, lon)
                    lm = chunk.getLandmaskForLatLon(lat, lon)
                    if lm == 1.0 and value != 32767.0 and not masker.isLatLonMasked(
                            lat, lon):
                        values.append(value)

                if len(values) > 0:
                    avg = np.average(values)
                    min = np.min(values)
                    max = np.max(values)
                    std = np.std(values)
                    cnt = len(values)

                    xi = range(0, len(values))
                    slope, intercept, r_value, p_value, std_err = stats.linregress(
                        xi, values)

                else:
                    avg, min, max, std, cnt = (0, 0, 0, 0, 0)
                    slope, intercept, r_value, p_value, std_err = (0, 0, 0, 0,
                                                                   0)

                avg = 0.0 if not self._validNumber(float(avg)) else float(avg)
                min = 0.0 if not self._validNumber(float(min)) else float(min)
                max = 0.0 if not self._validNumber(float(max)) else float(max)
                std = 0.0 if not self._validNumber(float(std)) else float(std)
                cnt = 0.0 if not self._validNumber(float(cnt)) else float(cnt)
                slope = 0.0 if not self._validNumber(
                    float(slope)) else float(slope)
                intercept = 0.0 if not self._validNumber(
                    float(intercept)) else float(intercept)
                r_value = 0.0 if not self._validNumber(
                    float(r_value)) else float(r_value)
                p_value = 0.0 if not self._validNumber(
                    float(p_value)) else float(p_value)
                std_err = 0.0 if not self._validNumber(
                    float(std_err)) else float(std_err)

                a[y][x] = {
                    'avg': avg,
                    'min': min,
                    'max': max,
                    'std': std,
                    'cnt': cnt,
                    'slope': slope,
                    'intercept': intercept,
                    'r': r_value,
                    'p': p_value,
                    'stderr': std_err,
                    'lat': float(lat),
                    'lon': float(lon)
                }

                lon = lon + 1
                x = x + 1
            lat = lat + 1
            y = y + 1

        return LongitudeLatitudeMapResults(results=a,
                                           meta=meta,
                                           computeOptions=computeOptions)
Code Example #7
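Another multiprocessing time-series variant, this one driven by a bounding polygon passed as WKT: when the seasonal filter is requested it subtracts per-month means from calculate_monthly_average inline, then applies the low-pass filters, logging the duration of each stage.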
    def getTimeSeriesStatsForBoxSingleDataSet(self, bounding_polygon, ds, start_seconds_from_epoch,
                                              end_seconds_from_epoch,
                                              apply_seasonal_cycle_filter=True, apply_low_pass_filter=True):

        the_time = datetime.now()
        daysinrange = self._get_tile_service().find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                bounding_polygon.bounds[3],
                                                                bounding_polygon.bounds[0],
                                                                bounding_polygon.bounds[2],
                                                                ds,
                                                                start_seconds_from_epoch,
                                                                end_seconds_from_epoch)
        logger.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        if len(daysinrange) == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        the_time = datetime.now()
        maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

        results = []
        if maxprocesses == 1:
            calculator = TimeSeriesCalculator()
            for dayinseconds in daysinrange:
                result = calculator.calc_average_on_day(bounding_polygon.wkt, ds, dayinseconds)
                results += [result] if result else []
        else:
            # Create a task to calc average difference for each day
            manager = Manager()
            work_queue = manager.Queue()
            done_queue = manager.Queue()
            for dayinseconds in daysinrange:
                work_queue.put(
                    ('calc_average_on_day', bounding_polygon.wkt, ds, dayinseconds))
            [work_queue.put(SENTINEL) for _ in range(0, maxprocesses)]

            # Start new processes to handle the work
            pool = Pool(maxprocesses)
            [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in range(0, maxprocesses)]
            pool.close()

            # Collect the results as [(day (in ms), average difference for that day)]
            for i in range(0, len(daysinrange)):
                result = done_queue.get()
                try:
                    error_str = result['error']
                    logger.error(error_str)
                    raise NexusProcessingException(reason="Error calculating average by day.")
                except KeyError:
                    pass

                results += [result] if result else []

            pool.terminate()
            manager.shutdown()

        results = sorted(results, key=lambda entry: entry["time"])
        logger.info("Time series calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        if apply_seasonal_cycle_filter:
            the_time = datetime.now()
            for result in results:
                month = datetime.utcfromtimestamp(result['time']).month
                month_mean, month_max, month_min = self.calculate_monthly_average(month, bounding_polygon.wkt, ds)
                seasonal_mean = result['mean'] - month_mean
                seasonal_min = result['min'] - month_min
                seasonal_max = result['max'] - month_max
                result['meanSeasonal'] = seasonal_mean
                result['minSeasonal'] = seasonal_min
                result['maxSeasonal'] = seasonal_max
            logger.info(
                "Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        the_time = datetime.now()
        filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False, applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False, applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False, applyLowPass=apply_low_pass_filter)

        if apply_seasonal_cycle_filter and apply_low_pass_filter:
            try:
                filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
            except Exception as e:
                # If it doesn't work log the error but ignore it
                tb = traceback.format_exc()
                logger.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

        logger.info(
            "LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

        return results, {}
Code Example #8
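A Spark correlation-map handler for a pair of datasets: each tile and time slice is mapped to running sums (sum_x, sum_y, sum_xx, sum_yy, sum_xy, n), combineByKey merges them per bounding box, and each pixel's Pearson coefficient is computed as (sum_xy - sum_x*sum_y/n) / sqrt((sum_xx - sum_x*sum_x/n) * (sum_yy - sum_y*sum_y/n)). The lambda (bounds, (...)) forms rely on Python 2 tuple-parameter unpacking, which PEP 3113 removed in Python 3.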
    def calc(self, computeOptions, **args):

        spark_master, spark_nexecs, spark_nparts = computeOptions.get_spark_cfg(
        )
        self._setQueryParams(computeOptions.get_dataset(),
                             (float(computeOptions.get_min_lat()),
                              float(computeOptions.get_max_lat()),
                              float(computeOptions.get_min_lon()),
                              float(computeOptions.get_max_lon())),
                             computeOptions.get_start_time(),
                             computeOptions.get_end_time(),
                             spark_master=spark_master,
                             spark_nexecs=spark_nexecs,
                             spark_nparts=spark_nparts)

        self.log.debug('ds = {0}'.format(self._ds))
        if not len(self._ds) == 2:
            raise NexusProcessingException(
                reason=
                "Requires two datasets for comparison. Specify request parameter ds=Dataset_1,Dataset_2",
                code=400)
        if next(iter([clim for clim in self._ds if 'CLIM' in clim]), False):
            raise NexusProcessingException(
                reason="Cannot compute correlation on a climatology", code=400)

        nexus_tiles = self._find_global_tile_set()
        # print 'tiles:'
        # for tile in nexus_tiles:
        #     print tile.granule
        #     print tile.section_spec
        #     print 'lat:', tile.latitudes
        #     print 'lon:', tile.longitudes

        #                                                          nexus_tiles)
        if len(nexus_tiles) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
        self.log.debug(
            'Using Native resolution: lat_res={0}, lon_res={1}'.format(
                self._latRes, self._lonRes))
        nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
        nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
        self.log.debug('nlats={0}, nlons={1}'.format(nlats, nlons))

        # Create array of tuples to pass to Spark map function
        nexus_tiles_spark = [[
            self._find_tile_bounds(t), self._startTime, self._endTime, self._ds
        ] for t in nexus_tiles]

        # Remove empty tiles (should have bounds set to None)
        bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
        for i in np.flipud(bad_tile_inds):
            del nexus_tiles_spark[i]

        # Expand Spark map tuple array by duplicating each entry N times,
        # where N is the number of ways we want the time dimension carved up.
        num_time_parts = 72
        # num_time_parts = 2
        # num_time_parts = 1
        nexus_tiles_spark = np.repeat(nexus_tiles_spark,
                                      num_time_parts,
                                      axis=0)
        self.log.debug('repeated len(nexus_tiles_spark) = {0}'.format(
            len(nexus_tiles_spark)))

        # Set the time boundaries for each of the Spark map tuples.
        # Every Nth element in the array gets the same time bounds.
        spark_part_times = np.linspace(self._startTime,
                                       self._endTime + 1,
                                       num_time_parts + 1,
                                       dtype=np.int64)

        spark_part_time_ranges = \
            np.repeat([[[spark_part_times[i],
                         spark_part_times[i + 1] - 1] for i in range(num_time_parts)]],
                      len(nexus_tiles_spark) / num_time_parts, axis=0).reshape((len(nexus_tiles_spark), 2))
        self.log.debug(
            'spark_part_time_ranges={0}'.format(spark_part_time_ranges))
        nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
        # print 'nexus_tiles_spark final = '
        # for i in range(len(nexus_tiles_spark)):
        #    print nexus_tiles_spark[i]

        # Launch Spark computations
        # print 'nexus_tiles_spark=',nexus_tiles_spark
        rdd = self._sc.parallelize(nexus_tiles_spark, self._spark_nparts)
        sum_tiles_part = rdd.map(self._map)
        # print "sum_tiles_part = ",sum_tiles_part.collect()
        sum_tiles = \
            sum_tiles_part.combineByKey(lambda val: val,
                                        lambda x, val: (x[0] + val[0],
                                                        x[1] + val[1],
                                                        x[2] + val[2],
                                                        x[3] + val[3],
                                                        x[4] + val[4],
                                                        x[5] + val[5]),
                                        lambda x, y: (x[0] + y[0],
                                                      x[1] + y[1],
                                                      x[2] + y[2],
                                                      x[3] + y[3],
                                                      x[4] + y[4],
                                                      x[5] + y[5]))
        # Convert the N (pixel-wise count) array for each tile to be a
        # NumPy masked array.  That is the last array in the tuple of
        # intermediate summation arrays.  Set mask to True if count is 0.
        sum_tiles = \
            sum_tiles.map(lambda (bounds, (sum_x, sum_y, sum_xx,
            sum_yy, sum_xy, n)):
                          (bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy,
                                    np.ma.array(n,
                                                mask=~(n.astype(bool))))))

        # print 'sum_tiles = ',sum_tiles.collect()

        # For each pixel in each tile compute an array of Pearson
        # correlation coefficients.  The map function is called once
        # per tile.  The result of this map operation is a list of 3-tuples of
        # (bounds, r, n) for each tile (r=Pearson correlation coefficient
        # and n=number of input values that went into each pixel with
        # any masked values not included).
        corr_tiles = \
            sum_tiles.map(lambda (bounds, (sum_x, sum_y, sum_xx, sum_yy,
            sum_xy, n)):
                          (bounds,
                           np.ma.array(((sum_xy - sum_x * sum_y / n) /
                                        np.sqrt((sum_xx - sum_x * sum_x / n) *
                                                (sum_yy - sum_y * sum_y / n))),
                                       mask=~(n.astype(bool))),
                           n)).collect()

        r = np.zeros((nlats, nlons), dtype=np.float64, order='C')
        n = np.zeros((nlats, nlons), dtype=np.uint32, order='C')

        # The tiles below are NOT Nexus objects.  They are tuples
        # with the following for each correlation map subset:
        # (1) lat-lon bounding box, (2) array of correlation r values,
        # and (3) array of count n values.
        for tile in corr_tiles:
            ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon),
             tile_data, tile_cnt) = tile
            y0 = self._lat2ind(tile_min_lat)
            y1 = self._lat2ind(tile_max_lat)
            x0 = self._lon2ind(tile_min_lon)
            x1 = self._lon2ind(tile_max_lon)
            self.log.debug(
                'writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                        y0, y1, x0, x1))
            r[y0:y1 + 1, x0:x1 + 1] = tile_data
            n[y0:y1 + 1, x0:x1 + 1] = tile_cnt

        # Store global map in a NetCDF file.
        self._create_nc_file(r, 'corrmap.nc', 'r')

        # Create dict for JSON response
        results = [[{
            'r': r[y, x],
            'cnt': int(n[y, x]),
            'lat': self._ind2lat(y),
            'lon': self._ind2lon(x)
        } for x in range(r.shape[1])] for y in range(r.shape[0])]

        return CorrelationResults(results)
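Under Python 3 the tuple-unpacking lambdas above would have to become ordinary functions that unpack explicitly. A sketch of the count-masking step, meant as an illustration rather than project code:

import numpy as np

def mask_counts(item):
    # Mask the pixel-wise count array wherever the count is zero, mirroring
    # the Python 2 lambda used in the example above.
    bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy, n) = item
    n_masked = np.ma.array(n, mask=~(n.astype(bool)))
    return bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy, n_masked)

# sum_tiles = sum_tiles.map(mask_counts)  # drop-in replacement for the lambda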
Code Example #9
File: TimeAvgMapSpark.py Project: lfcma/nexus
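The Spark rewrite of the time-averaged map from Example #5: tiles are paired with time sub-ranges, a map/combineByKey pass accumulates per-pixel sums and counts, per-pixel averages (or the fill value) are computed on the cluster, and the collected tiles are stitched into global value and count grids before the NetCDF and JSON output. Like Example #8, this is Python 2 code (tuple-unpacking lambdas, plain / division used as a repeat count).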
    def calc(self, computeOptions, **args):
        """

        :param computeOptions: StatsComputeOptions
        :param args: dict
        :return:
        """

        spark_master, spark_nexecs, spark_nparts = computeOptions.get_spark_cfg(
        )
        self._setQueryParams(computeOptions.get_dataset()[0],
                             (float(computeOptions.get_min_lat()),
                              float(computeOptions.get_max_lat()),
                              float(computeOptions.get_min_lon()),
                              float(computeOptions.get_max_lon())),
                             computeOptions.get_start_time(),
                             computeOptions.get_end_time(),
                             spark_master=spark_master,
                             spark_nexecs=spark_nexecs,
                             spark_nparts=spark_nparts)

        if 'CLIM' in self._ds:
            raise NexusProcessingException(
                reason=
                "Cannot compute Latitude/Longitude Time Average plot on a climatology",
                code=400)

        nexus_tiles = self._find_global_tile_set()
        # print 'tiles:'
        # for tile in nexus_tiles:
        #     print tile.granule
        #     print tile.section_spec
        #     print 'lat:', tile.latitudes
        #     print 'lon:', tile.longitudes

        #                                                          nexus_tiles)
        if len(nexus_tiles) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))

        self.log.debug(
            'Using Native resolution: lat_res={0}, lon_res={1}'.format(
                self._latRes, self._lonRes))
        nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
        nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
        self.log.debug('nlats={0}, nlons={1}'.format(nlats, nlons))
        self.log.debug('center lat range = {0} to {1}'.format(
            self._minLatCent, self._maxLatCent))
        self.log.debug('center lon range = {0} to {1}'.format(
            self._minLonCent, self._maxLonCent))

        # for tile in nexus_tiles:
        #    print 'lats: ', tile.latitudes.compressed()
        #    print 'lons: ', tile.longitudes.compressed()
        # Create array of tuples to pass to Spark map function
        nexus_tiles_spark = [[
            self._find_tile_bounds(t), self._startTime, self._endTime, self._ds
        ] for t in nexus_tiles]
        # print 'nexus_tiles_spark = ', nexus_tiles_spark
        # Remove empty tiles (should have bounds set to None)
        bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
        for i in np.flipud(bad_tile_inds):
            del nexus_tiles_spark[i]

        # Expand Spark map tuple array by duplicating each entry N times,
        # where N is the number of ways we want the time dimension carved up.
        num_time_parts = 72
        # num_time_parts = 1
        nexus_tiles_spark = np.repeat(nexus_tiles_spark,
                                      num_time_parts,
                                      axis=0)
        self.log.debug('repeated len(nexus_tiles_spark) = {0}'.format(
            len(nexus_tiles_spark)))

        # Set the time boundaries for each of the Spark map tuples.
        # Every Nth element in the array gets the same time bounds.
        spark_part_times = np.linspace(self._startTime,
                                       self._endTime,
                                       num_time_parts + 1,
                                       dtype=np.int64)

        spark_part_time_ranges = \
            np.repeat([[[spark_part_times[i],
                         spark_part_times[i + 1]] for i in range(num_time_parts)]],
                      len(nexus_tiles_spark) / num_time_parts, axis=0).reshape((len(nexus_tiles_spark), 2))
        self.log.debug(
            'spark_part_time_ranges={0}'.format(spark_part_time_ranges))
        nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
        # print 'nexus_tiles_spark final = '
        # for i in range(len(nexus_tiles_spark)):
        #    print nexus_tiles_spark[i]

        # Launch Spark computations
        rdd = self._sc.parallelize(nexus_tiles_spark, self._spark_nparts)
        sum_count_part = rdd.map(self._map)
        sum_count = \
            sum_count_part.combineByKey(lambda val: val,
                                        lambda x, val: (x[0] + val[0],
                                                        x[1] + val[1]),
                                        lambda x, y: (x[0] + y[0], x[1] + y[1]))
        fill = self._fill
        avg_tiles = \
            sum_count.map(lambda (bounds, (sum_tile, cnt_tile)):
                          (bounds, [[{'avg': (sum_tile[y, x] / cnt_tile[y, x])
                          if (cnt_tile[y, x] > 0)
                          else fill,
                                      'cnt': cnt_tile[y, x]}
                                     for x in
                                     range(sum_tile.shape[1])]
                                    for y in
                                    range(sum_tile.shape[0])])).collect()

        # Combine subset results to produce global map.
        #
        # The tiles below are NOT Nexus objects.  They are tuples
        # with the time avg map data and lat-lon bounding box.
        a = np.zeros((nlats, nlons), dtype=np.float64, order='C')
        n = np.zeros((nlats, nlons), dtype=np.uint32, order='C')
        for tile in avg_tiles:
            if tile is not None:
                ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon),
                 tile_stats) = tile
                tile_data = np.ma.array([[
                    tile_stats[y][x]['avg'] for x in range(len(tile_stats[0]))
                ] for y in range(len(tile_stats))])
                tile_cnt = np.array([[
                    tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))
                ] for y in range(len(tile_stats))])
                tile_data.mask = ~(tile_cnt.astype(bool))
                y0 = self._lat2ind(tile_min_lat)
                y1 = y0 + tile_data.shape[0] - 1
                x0 = self._lon2ind(tile_min_lon)
                x1 = x0 + tile_data.shape[1] - 1
                if np.any(np.logical_not(tile_data.mask)):
                    self.log.debug(
                        'writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                        .format(tile_min_lat, tile_max_lat, tile_min_lon,
                                tile_max_lon, y0, y1, x0, x1))
                    a[y0:y1 + 1, x0:x1 + 1] = tile_data
                    n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
                else:
                    self.log.debug(
                        'All pixels masked in tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                        .format(tile_min_lat, tile_max_lat, tile_min_lon,
                                tile_max_lon, y0, y1, x0, x1))

        # Store global map in a NetCDF file.
        self._create_nc_file(a, 'tam.nc', 'val', fill=self._fill)

        # Create dict for JSON response
        results = [[{
            'avg': a[y, x],
            'cnt': int(n[y, x]),
            'lat': self._ind2lat(y),
            'lon': self._ind2lon(x)
        } for x in range(a.shape[1])] for y in range(a.shape[0])]

        return TimeAvgMapSparkResults(results=results,
                                      meta={},
                                      computeOptions=computeOptions)
Code Example #10
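The current Python 3, metrics-instrumented version of the Spark time-averaged map: the time axis is split with np.array_split over the actual days in range, partial(...) binds the tile-service factory and metrics callback into the map functions, and the reduce and total durations are recorded on the metrics record before a NexusResults object is returned.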
    def calc(self, compute_options, **args):
        """

        :param compute_options: StatsComputeOptions
        :param args: dict
        :return:
        """
        request_start_time = datetime.now()

        metrics_record = self._create_metrics_record()

        ds, bbox, start_time, end_time, nparts_requested = self.parse_arguments(
            compute_options)
        self._setQueryParams(ds, (float(bbox.bounds[1]), float(
            bbox.bounds[3]), float(bbox.bounds[0]), float(bbox.bounds[2])),
                             start_time, end_time)

        nexus_tiles = self._find_global_tile_set(
            metrics_callback=metrics_record.record_metrics)

        if len(nexus_tiles) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
        print('Found {} tiles'.format(len(nexus_tiles)))

        daysinrange = self._get_tile_service().find_days_in_range_asc(
            bbox.bounds[1],
            bbox.bounds[3],
            bbox.bounds[0],
            bbox.bounds[2],
            ds,
            start_time,
            end_time,
            metrics_callback=metrics_record.record_metrics)
        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")
        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))

        self.log.debug(
            'Using Native resolution: lat_res={0}, lon_res={1}'.format(
                self._latRes, self._lonRes))
        self.log.debug('nlats={0}, nlons={1}'.format(self._nlats, self._nlons))
        self.log.debug('center lat range = {0} to {1}'.format(
            self._minLatCent, self._maxLatCent))
        self.log.debug('center lon range = {0} to {1}'.format(
            self._minLonCent, self._maxLonCent))

        # Create array of tuples to pass to Spark map function
        nexus_tiles_spark = [[
            self._find_tile_bounds(t), self._startTime, self._endTime, self._ds
        ] for t in nexus_tiles]

        # Remove empty tiles (should have bounds set to None)
        bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
        for i in np.flipud(bad_tile_inds):
            del nexus_tiles_spark[i]

        # Expand Spark map tuple array by duplicating each entry N times,
        # where N is the number of ways we want the time dimension carved up.
        # Set the time boundaries for each of the Spark map tuples so that
        # every Nth element in the array gets the same time bounds.
        max_time_parts = 72
        num_time_parts = min(max_time_parts, ndays)

        spark_part_time_ranges = np.tile(
            np.array([
                a[[0, -1]]
                for a in np.array_split(np.array(daysinrange), num_time_parts)
            ]), (len(nexus_tiles_spark), 1))
        nexus_tiles_spark = np.repeat(nexus_tiles_spark,
                                      num_time_parts,
                                      axis=0)
        nexus_tiles_spark[:, 1:3] = spark_part_time_ranges

        # Launch Spark computations
        spark_nparts = self._spark_nparts(nparts_requested)
        self.log.info('Using {} partitions'.format(spark_nparts))

        rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts)
        metrics_record.record_metrics(partitions=rdd.getNumPartitions())
        sum_count_part = rdd.map(
            partial(self._map, self._tile_service_factory,
                    metrics_record.record_metrics))
        reduce_duration = 0
        reduce_start = datetime.now()
        sum_count = sum_count_part.combineByKey(
            lambda val: val, lambda x, val: (x[0] + val[0], x[1] + val[1]),
            lambda x, y: (x[0] + y[0], x[1] + y[1]))
        reduce_duration += (datetime.now() - reduce_start).total_seconds()
        avg_tiles = sum_count.map(
            partial(calculate_means, metrics_record.record_metrics,
                    self._fill)).collect()

        reduce_start = datetime.now()
        # Combine subset results to produce global map.
        #
        # The tiles below are NOT Nexus objects.  They are tuples
        # with the time avg map data and lat-lon bounding box.
        a = np.zeros((self._nlats, self._nlons), dtype=np.float64, order='C')
        n = np.zeros((self._nlats, self._nlons), dtype=np.uint32, order='C')
        for tile in avg_tiles:
            if tile is not None:
                ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon),
                 tile_stats) = tile
                tile_data = np.ma.array([[
                    tile_stats[y][x]['avg'] for x in range(len(tile_stats[0]))
                ] for y in range(len(tile_stats))])
                tile_cnt = np.array([[
                    tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))
                ] for y in range(len(tile_stats))])
                tile_data.mask = ~(tile_cnt.astype(bool))
                y0 = self._lat2ind(tile_min_lat)
                y1 = y0 + tile_data.shape[0] - 1
                x0 = self._lon2ind(tile_min_lon)
                x1 = x0 + tile_data.shape[1] - 1
                if np.any(np.logical_not(tile_data.mask)):
                    self.log.debug(
                        'writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                        .format(tile_min_lat, tile_max_lat, tile_min_lon,
                                tile_max_lon, y0, y1, x0, x1))
                    a[y0:y1 + 1, x0:x1 + 1] = tile_data
                    n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
                else:
                    self.log.debug(
                        'All pixels masked in tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                        .format(tile_min_lat, tile_max_lat, tile_min_lon,
                                tile_max_lon, y0, y1, x0, x1))

        # Store global map in a NetCDF file for debugging purpose
        # if activated this line is not thread safe and might cause error when concurrent access occurs
        # self._create_nc_file(a, 'tam.nc', 'val', fill=self._fill)

        # Create dict for JSON response
        results = [[{
            'mean': a[y, x],
            'cnt': int(n[y, x]),
            'lat': self._ind2lat(y),
            'lon': self._ind2lon(x)
        } for x in range(a.shape[1])] for y in range(a.shape[0])]

        total_duration = (datetime.now() - request_start_time).total_seconds()
        metrics_record.record_metrics(actual_time=total_duration,
                                      reduce=reduce_duration)
        metrics_record.print_metrics(self.log)

        return NexusResults(results=results,
                            meta={},
                            stats=None,
                            computeOptions=None,
                            minLat=bbox.bounds[1],
                            maxLat=bbox.bounds[3],
                            minLon=bbox.bounds[0],
                            maxLon=bbox.bounds[2],
                            ds=ds,
                            startTime=start_time,
                            endTime=end_time)
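The pairing of tiles with time sub-ranges above works by tiling the list of [first_day, last_day] pairs once per tile and repeating each tile row once per time part. A small standalone illustration of the shapes involved (not project code):

import numpy as np

tiles = [['bounds_a', None, None, 'ds'], ['bounds_b', None, None, 'ds']]
time_ranges = np.array([[0, 9], [10, 19]])               # two time sub-ranges
expanded = np.repeat(np.array(tiles, dtype=object), 2, axis=0)
expanded[:, 1:3] = np.tile(time_ranges, (len(tiles), 1))
# expanded now has four rows: each tile appears once per time sub-range.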
Code Example #11
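The Spark time-series handler: for each dataset it finds the days in range, runs spark_driver over the requested partitions, and, when the seasonal filter is requested, deseasons the series against a companion climatology dataset (shortName + "_clim", which must cover all 12 months) before applying the low-pass filters, merging per-dataset results, and computing comparison statistics when exactly two datasets were requested.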
    def calc(self, request, **args):
        """

        :param request: StatsComputeOptions
        :param args: dict
        :return:
        """
        start_time = datetime.now()
        ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, apply_seasonal_cycle_filter, apply_low_pass_filter, nparts_requested, normalize_dates = self.parse_arguments(
            request)
        metrics_record = self._create_metrics_record()

        resultsRaw = []

        for shortName in ds:

            the_time = datetime.now()
            daysinrange = self._get_tile_service().find_days_in_range_asc(
                bounding_polygon.bounds[1],
                bounding_polygon.bounds[3],
                bounding_polygon.bounds[0],
                bounding_polygon.bounds[2],
                shortName,
                start_seconds_from_epoch,
                end_seconds_from_epoch,
                metrics_callback=metrics_record.record_metrics)
            self.log.info("Finding days in range took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            ndays = len(daysinrange)
            if ndays == 0:
                raise NoDataException(
                    reason="No data found for selected timeframe")

            self.log.debug('Found {0} days in range'.format(ndays))
            for i, d in enumerate(daysinrange):
                self.log.debug('{0}, {1}'.format(i,
                                                 datetime.utcfromtimestamp(d)))
            spark_nparts = self._spark_nparts(nparts_requested)
            self.log.info('Using {} partitions'.format(spark_nparts))
            results, meta = spark_driver(daysinrange,
                                         bounding_polygon,
                                         shortName,
                                         self._tile_service_factory,
                                         metrics_record.record_metrics,
                                         normalize_dates,
                                         spark_nparts=spark_nparts,
                                         sc=self._sc)

            if apply_seasonal_cycle_filter:
                the_time = datetime.now()
                # get time series for _clim dataset
                shortName_clim = shortName + "_clim"
                daysinrange_clim = self._get_tile_service(
                ).find_days_in_range_asc(
                    bounding_polygon.bounds[1],
                    bounding_polygon.bounds[3],
                    bounding_polygon.bounds[0],
                    bounding_polygon.bounds[2],
                    shortName_clim,
                    0,
                    SECONDS_IN_ONE_YEAR,
                    metrics_callback=metrics_record.record_metrics)
                if len(daysinrange_clim) == 0:
                    raise NexusProcessingException(
                        reason=
                        "There is no climatology data present for dataset " +
                        shortName + ".")
                results_clim, _ = spark_driver(daysinrange_clim,
                                               bounding_polygon,
                                               shortName_clim,
                                               self._tile_service_factory,
                                               metrics_record.record_metrics,
                                               normalize_dates=False,
                                               spark_nparts=spark_nparts,
                                               sc=self._sc)
                clim_indexed_by_month = {
                    datetime.utcfromtimestamp(result['time']).month: result
                    for result in results_clim
                }
                if len(clim_indexed_by_month) < 12:
                    raise NexusProcessingException(
                        reason="There are only " +
                        str(len(clim_indexed_by_month)) +
                        " months of climatology data for dataset " +
                        shortName +
                        ". A full year of climatology data is required for computing deseasoned timeseries."
                    )

                for result in results:
                    month = datetime.utcfromtimestamp(result['time']).month

                    result['meanSeasonal'] = result[
                        'mean'] - clim_indexed_by_month[month]['mean']
                    result['minSeasonal'] = result[
                        'min'] - clim_indexed_by_month[month]['min']
                    result['maxSeasonal'] = result[
                        'max'] - clim_indexed_by_month[month]['max']
                self.log.info("Seasonal calculation took %s for dataset %s" %
                              (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            filtering.applyAllFiltersOnField(
                results,
                'mean',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'max',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'min',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)

            if apply_seasonal_cycle_filter and apply_low_pass_filter:
                try:
                    filtering.applyFiltersOnField(results,
                                                  'meanSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'minSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'maxSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                except Exception as e:
                    # If it doesn't work log the error but ignore it
                    tb = traceback.format_exc()
                    self.log.warn(
                        "Error calculating SeasonalLowPass filter:\n%s" % tb)

            resultsRaw.append([results, meta])
            self.log.info("LowPass filter calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            self._create_nc_file_time1d(np.array(results),
                                        'ts.nc',
                                        'mean',
                                        fill=-9999.)
            self.log.info("NetCDF generation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        results = self._mergeResults(resultsRaw)

        if len(ds) == 2:
            try:
                stats = TimeSeriesSparkHandlerImpl.calculate_comparison_stats(
                    results)
            except Exception:
                stats = {}
                tb = traceback.format_exc()
                self.log.warn("Error when calculating comparison stats:\n%s" %
                              tb)
        else:
            stats = {}

        meta = []
        for singleRes in resultsRaw:
            meta.append(singleRes[1])

        res = TimeSeriesResults(results=results,
                                meta=meta,
                                stats=stats,
                                computeOptions=None,
                                minLat=bounding_polygon.bounds[1],
                                maxLat=bounding_polygon.bounds[3],
                                minLon=bounding_polygon.bounds[0],
                                maxLon=bounding_polygon.bounds[2],
                                ds=ds,
                                startTime=start_seconds_from_epoch,
                                endTime=end_seconds_from_epoch)

        total_duration = (datetime.now() - start_time).total_seconds()
        metrics_record.record_metrics(actual_time=total_duration)
        metrics_record.print_metrics(logger)

        self.log.info("Merging results and calculating comparisons took %s" %
                      (str(datetime.now() - the_time)))
        return res
Code Example #12
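A Spark variance-map handler: a first map/combineByKey pass produces the per-pixel means (x_bar), those means are fed back into a second pass (_calc_variance) that accumulates squared anomalies, and the per-tile variance and count values are collected; the listing ends just before the tiles are combined into the global map.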
    def calc(self, compute_options, **args):
        """

        :param compute_options: StatsComputeOptions
        :param args: dict
        :return:
        """

        ds, bbox, start_time, end_time, nparts_requested = self.parse_arguments(
            compute_options)
        self._setQueryParams(ds, (float(bbox.bounds[1]), float(
            bbox.bounds[3]), float(bbox.bounds[0]), float(bbox.bounds[2])),
                             start_time, end_time)

        nexus_tiles = self._find_global_tile_set()

        if len(nexus_tiles) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
        print('Found {} tiles'.format(len(nexus_tiles)))

        daysinrange = self._tile_service.find_days_in_range_asc(
            bbox.bounds[1], bbox.bounds[3], bbox.bounds[0], bbox.bounds[2], ds,
            start_time, end_time)
        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")
        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))

        self.log.debug(
            'Using Native resolution: lat_res={0}, lon_res={1}'.format(
                self._latRes, self._lonRes))
        self.log.debug('nlats={0}, nlons={1}'.format(self._nlats, self._nlons))
        self.log.debug('center lat range = {0} to {1}'.format(
            self._minLatCent, self._maxLatCent))
        self.log.debug('center lon range = {0} to {1}'.format(
            self._minLonCent, self._maxLonCent))

        # Create array of tuples to pass to Spark map function
        nexus_tiles_spark = [[
            self._find_tile_bounds(t), self._startTime, self._endTime, self._ds
        ] for t in nexus_tiles]

        # Remove empty tiles (should have bounds set to None)
        bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
        for i in np.flipud(bad_tile_inds):
            del nexus_tiles_spark[i]

        # Expand Spark map tuple array by duplicating each entry N times,
        # where N is the number of ways we want the time dimension carved up.
        # Set the time boundaries for each of the Spark map tuples so that
        # every Nth element in the array gets the same time bounds.
        max_time_parts = 72
        num_time_parts = min(max_time_parts, ndays)

        spark_part_time_ranges = np.tile(
            np.array([
                a[[0, -1]]
                for a in np.array_split(np.array(daysinrange), num_time_parts)
            ]), (len(nexus_tiles_spark), 1))
        nexus_tiles_spark = np.repeat(nexus_tiles_spark,
                                      num_time_parts,
                                      axis=0)
        nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
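        # As an illustration: with 2 surviving tiles and num_time_parts = 3,
        # np.repeat turns the 2 rows into 6 (tile0 three times, then tile1
        # three times) while np.tile repeats the 3 [first_day, last_day]
        # pairs twice, so every tile is paired with each of the 3 time chunks
        # produced by np.array_split(daysinrange, num_time_parts).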

        # Launch Spark computations to calculate x_bar
        spark_nparts = self._spark_nparts(nparts_requested)
        self.log.info('Using {} partitions'.format(spark_nparts))

        rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts)
        sum_count_part = rdd.map(self._map)
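        # self._map is expected to emit (bounds, (sum_grid, count_grid)) pairs
        # for its slice of time; combineByKey below adds the grids element-wise
        # per tile bounds, so the mean for each cell is simply sum / count.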
        sum_count = \
            sum_count_part.combineByKey(lambda val: val,
                                        lambda x, val: (x[0] + val[0],
                                                        x[1] + val[1]),
                                        lambda x, y: (x[0] + y[0], x[1] + y[1]))
        fill = self._fill
        # Tuple unpacking in lambda parameters was removed in Python 3, so
        # unpack the (bounds, (sum_tile, cnt_tile)) pairs in a small helper.
        def _avg_tile(item):
            bounds, (sum_tile, cnt_tile) = item
            return (bounds, [[(sum_tile[y, x] / cnt_tile[y, x])
                              if (cnt_tile[y, x] > 0)
                              else fill
                              for x in range(sum_tile.shape[1])]
                             for y in range(sum_tile.shape[0])])

        avg_tiles = sum_count.map(_avg_tile).collect()

        #
        # Launch a second parallel computation to calculate variance from x_bar
        #

        # Create the array of tuples to pass to the Spark map function. The
        # first element is the tile bounds from the first pass and the last
        # element is the corresponding result data (x bar).
        nexus_tiles_spark = [[
            t[0], self._startTime, self._endTime, self._ds, t[1]
        ] for t in avg_tiles]
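        # self._calc_variance is assumed to emit, per tile bounds, a grid of
        # summed squared anomalies (x - x_bar)**2 along with a count grid;
        # after combining across time partitions, the variance for each cell
        # is the summed squared anomaly divided by its count.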

        self.log.info('Using {} partitions'.format(spark_nparts))
        rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts)

        anomaly_squared_part = rdd.map(self._calc_variance)
        anomaly_squared = \
            anomaly_squared_part.combineByKey(lambda val: val,
                                              lambda x, val: (x[0] + val[0],
                                                              x[1] + val[1]),
                                              lambda x, y: (x[0] + y[0], x[1] + y[1]))

        # Same Python 3 fix as above: unpack the key/value pair explicitly
        # instead of using tuple parameters in the lambda signature.
        def _variance_tile(item):
            bounds, (anomaly_squared_tile, cnt_tile) = item
            return (bounds,
                    [[{'variance': (anomaly_squared_tile[y, x] / cnt_tile[y, x])
                       if (cnt_tile[y, x] > 0)
                       else fill,
                       'cnt': cnt_tile[y, x]}
                      for x in range(anomaly_squared_tile.shape[1])]
                     for y in range(anomaly_squared_tile.shape[0])])

        variance_tiles = anomaly_squared.map(_variance_tile).collect()

        # Combine subset results to produce global map.
        #
        # The tiles below are NOT Nexus objects.  They are tuples
        # with the time avg map data and lat-lon bounding box.
        a = np.zeros((self._nlats, self._nlons), dtype=np.float64, order='C')
        n = np.zeros((self._nlats, self._nlons), dtype=np.uint32, order='C')
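        # 'a' accumulates the per-cell variance values and 'n' the contributing
        # counts; _lat2ind/_lon2ind convert each tile's minimum lat/lon into the
        # row/column offset of the global grid so tile data is pasted in place.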
        for tile in variance_tiles:
            if tile is not None:
                ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon),
                 tile_stats) = tile
                tile_data = np.ma.array([[
                    tile_stats[y][x]['variance']
                    for x in range(len(tile_stats[0]))
                ] for y in range(len(tile_stats))])
                tile_cnt = np.array([[
                    tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))
                ] for y in range(len(tile_stats))])
                tile_data.mask = ~(tile_cnt.astype(bool))
                y0 = self._lat2ind(tile_min_lat)
                y1 = y0 + tile_data.shape[0] - 1
                x0 = self._lon2ind(tile_min_lon)
                x1 = x0 + tile_data.shape[1] - 1
                if np.any(np.logical_not(tile_data.mask)):
                    self.log.debug(
                        'writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                        .format(tile_min_lat, tile_max_lat, tile_min_lon,
                                tile_max_lon, y0, y1, x0, x1))
                    a[y0:y1 + 1, x0:x1 + 1] = tile_data
                    n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
                else:
                    self.log.debug(
                        'All pixels masked in tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                        .format(tile_min_lat, tile_max_lat, tile_min_lon,
                                tile_max_lon, y0, y1, x0, x1))

        # Store global map in a NetCDF file.
        self._create_nc_file(a, 'tam.nc', 'val', fill=self._fill)

        # Create dict for JSON response
        results = [[{
            'variance': a[y, x],
            'cnt': int(n[y, x]),
            'lat': self._ind2lat(y),
            'lon': self._ind2lon(x)
        } for x in range(a.shape[1])] for y in range(a.shape[0])]

        return NexusResults(results=results,
                            meta={},
                            stats=None,
                            computeOptions=None,
                            minLat=bbox.bounds[1],
                            maxLat=bbox.bounds[3],
                            minLon=bbox.bounds[0],
                            maxLon=bbox.bounds[2],
                            ds=ds,
                            startTime=start_time,
                            endTime=end_time)
Code example #13
    def getTimeSeriesStatsForBoxSingleDataSet(self,
                                              min_lat,
                                              max_lat,
                                              min_lon,
                                              max_lon,
                                              ds,
                                              start_time=0,
                                              end_time=-1,
                                              applySeasonalFilter=False,
                                              applyLowPass=False):

        daysinrange = self._tile_service.find_days_in_range_asc(
            min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time)

        if len(daysinrange) == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        print('Found %d days in range' % len(daysinrange))

        cwd = os.getcwd()

        # Configure Spark
        sp_conf = SparkConf()
        sp_conf.setAppName("Spark Time Avg Map")
        sp_conf.set("spark.executorEnv.HOME",
                    os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        sp_conf.set("spark.executorEnv.PYTHONPATH", cwd)
        #sp_conf.set("spark.yarn.executor.memoryOverhead", "4000")
        sp_conf.set("spark.executor.memory", "4g")

        #num_parts = 1
        #num_parts = 16
        #num_parts = 32
        #num_parts = 64
        num_parts = 128
        #num_execs = 1
        #num_execs = 16
        #num_execs = 32
        num_execs = 64
        cores_per_exec = 1
        sp_conf.setMaster("yarn-client")
        #sp_conf.setMaster("local[16]")
        #sp_conf.setMaster("local[1]")
        sp_conf.set("spark.executor.instances", num_execs)
        sp_conf.set("spark.executor.cores", cores_per_exec)

        #print sp_conf.getAll()
        sc = SparkContext(conf=sp_conf)

        nexus_tiles_spark = [
            (min_lat, max_lat, min_lon, max_lon, ds, list(daysinrange_part),
             cwd)
            for daysinrange_part in np.array_split(daysinrange, num_parts)
        ]
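        # Each Spark task gets the full bounding box plus one contiguous chunk
        # of days: np.array_split divides daysinrange into num_parts roughly
        # equal pieces, so the time dimension is what gets parallelized here.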

        #for tile in nexus_tiles_spark:
        #    print tile

        # Launch Spark computations
        rdd = sc.parallelize(nexus_tiles_spark, num_parts)
        results = rdd.map(TimeSeriesCalculator.calc_average_on_day).collect()
        #
        results = list(itertools.chain.from_iterable(results))
        results = sorted(results, key=lambda entry: entry["time"])

        #filt.applyAllFiltersOnField(results, 'mean', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
        #filt.applyAllFiltersOnField(results, 'max', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
        #filt.applyAllFiltersOnField(results, 'min', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)

        self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean')
        return results, {}
Code example #14
    def calc(self, request, **args):
        """
    
        :param request: StatsComputeOptions
        :param args: dict
        :return:
        """

        ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, apply_seasonal_cycle_filter, apply_low_pass_filter, nparts_requested = self.parse_arguments(
            request)

        resultsRaw = []

        for shortName in ds:

            the_time = datetime.now()
            daysinrange = self._tile_service.find_days_in_range_asc(
                bounding_polygon.bounds[1], bounding_polygon.bounds[3],
                bounding_polygon.bounds[0], bounding_polygon.bounds[2],
                shortName, start_seconds_from_epoch, end_seconds_from_epoch)
            self.log.info("Finding days in range took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            ndays = len(daysinrange)
            if ndays == 0:
                raise NoDataException(
                    reason="No data found for selected timeframe")

            self.log.debug('Found {0} days in range'.format(ndays))
            for i, d in enumerate(daysinrange):
                self.log.debug('{0}, {1}'.format(i,
                                                 datetime.utcfromtimestamp(d)))
            spark_nparts = self._spark_nparts(nparts_requested)
            self.log.info('Using {} partitions'.format(spark_nparts))
            the_time = datetime.now()
            results, meta = spark_driver(daysinrange,
                                         bounding_polygon,
                                         shortName,
                                         spark_nparts=spark_nparts,
                                         sc=self._sc)
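            # spark_driver is assumed to fan daysinrange out across
            # spark_nparts partitions and return the per-day statistics for
            # shortName plus the dataset metadata used in the response.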
            self.log.info("Time series calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            if apply_seasonal_cycle_filter:
                the_time = datetime.now()
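                # The *Seasonal fields are anomalies relative to the monthly
                # climatology: calculate_monthly_average is assumed to return
                # the long-term mean/max/min for that calendar month over the
                # requested region, which is subtracted from each day's stats.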
                for result in results:
                    month = datetime.utcfromtimestamp(result['time']).month
                    month_mean, month_max, month_min = self.calculate_monthly_average(
                        month, bounding_polygon.wkt, shortName)
                    seasonal_mean = result['mean'] - month_mean
                    seasonal_min = result['min'] - month_min
                    seasonal_max = result['max'] - month_max
                    result['meanSeasonal'] = seasonal_mean
                    result['minSeasonal'] = seasonal_min
                    result['maxSeasonal'] = seasonal_max
                self.log.info("Seasonal calculation took %s for dataset %s" %
                              (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            filtering.applyAllFiltersOnField(
                results,
                'mean',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'max',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)
            filtering.applyAllFiltersOnField(
                results,
                'min',
                applySeasonal=False,
                applyLowPass=apply_low_pass_filter)

            if apply_seasonal_cycle_filter and apply_low_pass_filter:
                try:
                    filtering.applyFiltersOnField(results,
                                                  'meanSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'minSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                    filtering.applyFiltersOnField(results,
                                                  'maxSeasonal',
                                                  applySeasonal=False,
                                                  applyLowPass=True,
                                                  append="LowPass")
                except Exception:
                    # If it doesn't work log the error but ignore it
                    tb = traceback.format_exc()
                    self.log.warn(
                        "Error calculating SeasonalLowPass filter:\n%s" % tb)

            resultsRaw.append([results, meta])
            self.log.info("LowPass filter calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

            the_time = datetime.now()
            self._create_nc_file_time1d(np.array(results),
                                        'ts.nc',
                                        'mean',
                                        fill=-9999.)
            self.log.info("NetCDF generation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        results = self._mergeResults(resultsRaw)

        if len(ds) == 2:
            try:
                stats = TimeSeriesHandlerImpl.calculate_comparison_stats(
                    results)
            except Exception:
                stats = {}
                tb = traceback.format_exc()
                self.log.warn("Error when calculating comparison stats:\n%s" %
                              tb)
        else:
            stats = {}

        meta = []
        for singleRes in resultsRaw:
            meta.append(singleRes[1])

        res = TimeSeriesResults(results=results,
                                meta=meta,
                                stats=stats,
                                computeOptions=None,
                                minLat=bounding_polygon.bounds[1],
                                maxLat=bounding_polygon.bounds[3],
                                minLon=bounding_polygon.bounds[0],
                                maxLon=bounding_polygon.bounds[2],
                                ds=ds,
                                startTime=start_seconds_from_epoch,
                                endTime=end_seconds_from_epoch)

        self.log.info("Merging results and calculating comparisons took %s" %
                      (str(datetime.now() - the_time)))
        return res