def calc(self, compute_options, **args):
    ds, bbox, start_time, end_time = self.parse_arguments(compute_options)

    min_lon, min_lat, max_lon, max_lat = bbox.bounds

    nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon) for x, tile in
                         enumerate(self._tile_service.find_tiles_in_box(min_lat, max_lat, min_lon, max_lon,
                                                                        ds, start_time, end_time,
                                                                        fetch_data=False))]

    print("Got {} tiles".format(len(nexus_tiles_spark)))
    if len(nexus_tiles_spark) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    results = spark_driver(self._sc, self._latlon, nexus_tiles_spark)
    results = filter(None, results)
    results = sorted(results, key=lambda entry: entry["time"])
    for i in range(len(results)):
        results[i]['lons'] = sorted(results[i]['lons'], key=lambda entry: entry['longitude'])

    # Deseason disabled. See SDAP-148
    # results = self.applyDeseasonToHofMoeller(results, pivot="lons")

    result = HoffMoellerResults(results=results, compute_options=None, type=HoffMoellerResults.LONGITUDE,
                                minLat=min_lat, maxLat=max_lat, minLon=min_lon, maxLon=max_lon,
                                ds=ds, startTime=start_time, endTime=end_time)
    return result
def calc(self, compute_options, **args):
    ds, bbox, start_time, end_time, normalize_dates = self.parse_arguments(compute_options)

    metrics_record = self._create_metrics_record()
    calculation_start = datetime.now()

    min_lon, min_lat, max_lon, max_lat = bbox.bounds

    nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon) for x, tile in
                         enumerate(self._get_tile_service().find_tiles_in_box(min_lat, max_lat, min_lon, max_lon,
                                                                              ds, start_time, end_time,
                                                                              metrics_callback=metrics_record.record_metrics,
                                                                              fetch_data=False))]

    print("Got {} tiles".format(len(nexus_tiles_spark)))
    if len(nexus_tiles_spark) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    results = spark_driver(self._sc, self._latlon, self._tile_service_factory, nexus_tiles_spark,
                           metrics_record.record_metrics, normalize_dates)
    results = [_f for _f in results if _f]
    results = sorted(results, key=lambda entry: entry["time"])
    for i in range(len(results)):
        results[i]['lons'] = sorted(results[i]['lons'], key=lambda entry: entry['longitude'])

    # Deseason disabled. See SDAP-148
    # results = self.applyDeseasonToHofMoeller(results, pivot="lons")

    result = HoffMoellerResults(results=results, compute_options=None, type=HoffMoellerResults.LONGITUDE,
                                minLat=min_lat, maxLat=max_lat, minLon=min_lon, maxLon=max_lon,
                                ds=ds, startTime=start_time, endTime=end_time)

    duration = (datetime.now() - calculation_start).total_seconds()
    metrics_record.record_metrics(actual_time=duration)
    metrics_record.print_metrics(self.log)

    return result
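Both calc() variants above delegate the per-tile Hofmoeller computation to spark_driver. As a rough illustration only (spark_driver_sketch, per_tile_stats, and the tuple layout below are hypothetical stand-ins, not the SDAP implementation), the driver pattern is: parallelize the prepared tile tuples, map a per-tile statistics function over them, and collect.

# Hypothetical sketch of the spark_driver(...) pattern assumed above.
def spark_driver_sketch(sc, tile_tuples, nparts=8):
    def per_tile_stats(tile_tuple):
        # The real map function would fetch the tile's data and compute
        # per-longitude statistics; here we only echo the tile id.
        tile_id = tile_tuple[1]
        return {'time': 0, 'tile_id': tile_id, 'lons': []}

    # One partition per tile at most; collect the per-tile dicts.
    rdd = sc.parallelize(tile_tuples, min(nparts, max(len(tile_tuples), 1)))
    return rdd.map(per_tile_stats).collect()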
def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0,
                                          end_time=-1, applySeasonalFilter=True, applyLowPass=True,
                                          fill=-9999., spark_master="local[1]", spark_nexecs=1,
                                          spark_nparts=1):
    daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds,
                                                            start_time, end_time)

    ndays = len(daysinrange)
    if ndays == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    self.log.debug('Found {0} days in range'.format(ndays))
    for i, d in enumerate(daysinrange):
        self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))

    spark_nparts_needed = min(spark_nparts, ndays)

    nexus_tiles_spark = [(min_lat, max_lat, min_lon, max_lon, ds, list(daysinrange_part), fill)
                         for daysinrange_part in np.array_split(daysinrange, spark_nparts_needed)]

    # Launch Spark computations
    rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts_needed)
    results = rdd.map(TimeSeriesCalculator.calc_average_on_day).collect()
    # results = list(itertools.chain.from_iterable(results))
    results = sorted(results, key=lambda entry: entry["time"])

    filt.applyAllFiltersOnField(results, 'mean', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'max', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'min', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)

    self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean', fill=-9999.)

    return results, {}
def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0,
                                          end_time=-1, applySeasonalFilter=True, applyLowPass=True):
    daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds,
                                                            start_time, end_time)

    if len(daysinrange) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    results = []
    if maxprocesses == 1:
        calculator = TimeSeriesCalculator()
        for dayinseconds in daysinrange:
            result = calculator.calc_average_on_day(min_lat, max_lat, min_lon, max_lon, ds, dayinseconds)
            results.append(result)
    else:
        # Create a task to calc average difference for each day
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for dayinseconds in daysinrange:
            work_queue.put(('calc_average_on_day', min_lat, max_lat, min_lon, max_lon, ds, dayinseconds))
        [work_queue.put(SENTINEL) for _ in xrange(0, maxprocesses)]

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in xrange(0, maxprocesses)]
        pool.close()

        # Collect the results as [(day (in ms), average difference for that day)]
        for i in xrange(0, len(daysinrange)):
            result = done_queue.get()
            try:
                error_str = result['error']
                self.log.error(error_str)
                raise NexusProcessingException(reason="Error calculating average by day.")
            except KeyError:
                pass

            results.append(result)

        pool.terminate()
        manager.shutdown()

    results = sorted(results, key=lambda entry: entry["time"])

    filt.applyAllFiltersOnField(results, 'mean', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'max', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'min', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)

    return results, {}
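The SENTINEL/work-queue pattern above is standard multiprocessing fan-out: one sentinel per worker tells each worker to stop draining the queue. A self-contained sketch of the same pattern with illustrative names (demo_pool_worker and DEMO_SENTINEL are not part of the codebase):

# Minimal sketch of the sentinel-terminated worker pool used above.
from multiprocessing import Manager, Pool

DEMO_SENTINEL = 'STOP'

def demo_pool_worker(work_queue, done_queue):
    # Keep pulling tasks until the sentinel appears.
    for task in iter(work_queue.get, DEMO_SENTINEL):
        day = task[-1]
        done_queue.put({'time': day, 'mean': float(day) * 0.5})

if __name__ == '__main__':
    manager = Manager()
    work, done = manager.Queue(), manager.Queue()
    days = [86400 * i for i in range(4)]
    for d in days:
        work.put(('calc_average_on_day', d))
    nprocs = 2
    for _ in range(nprocs):
        work.put(DEMO_SENTINEL)   # one sentinel per worker
    pool = Pool(nprocs)
    for _ in range(nprocs):
        pool.apply_async(demo_pool_worker, (work, done))
    pool.close()
    results = [done.get() for _ in days]  # one result per queued day
    pool.terminate()
    manager.shutdown()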
def calc(self, computeOptions, **args):
    """
    :param computeOptions: StatsComputeOptions
    :param args: dict
    :return:
    """
    self._minLat = float(computeOptions.get_min_lat())
    self._maxLat = float(computeOptions.get_max_lat())
    self._minLon = float(computeOptions.get_min_lon())
    self._maxLon = float(computeOptions.get_max_lon())
    self._ds = computeOptions.get_dataset()[0]
    self._startTime = computeOptions.get_start_time()
    self._endTime = computeOptions.get_end_time()

    self._find_native_resolution()
    print 'Using Native resolution: lat_res=%f, lon_res=%f' % (self._latRes, self._lonRes)
    self._minLatCent = self._minLat + self._latRes / 2
    self._minLonCent = self._minLon + self._lonRes / 2
    nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
    nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
    self._maxLatCent = self._minLatCent + (nlats - 1) * self._latRes
    self._maxLonCent = self._minLonCent + (nlons - 1) * self._lonRes
    print 'nlats=', nlats, 'nlons=', nlons
    print 'center lat range = %f to %f' % (self._minLatCent, self._maxLatCent)
    print 'center lon range = %f to %f' % (self._minLonCent, self._maxLonCent)
    sys.stdout.flush()
    a = np.zeros((nlats, nlons), dtype=np.float64, order='C')

    nexus_tiles = self._find_global_tile_set()
    # print 'tiles:'
    # for tile in nexus_tiles:
    #     print tile.granule
    #     print tile.section_spec
    #     print 'lat:', tile.latitudes
    #     print 'lon:', tile.longitudes

    if len(nexus_tiles) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    print 'Initially found %d tiles' % len(nexus_tiles)
    sys.stdout.flush()
    self._prune_tiles(nexus_tiles)
    print 'Pruned to %d tiles' % len(nexus_tiles)
    sys.stdout.flush()
    # for tile in nexus_tiles:
    #     print 'lats: ', tile.latitudes.compressed()
    #     print 'lons: ', tile.longitudes.compressed()

    avg_tiles = map(self._map, nexus_tiles)
    print 'shape a = ', a.shape
    sys.stdout.flush()
    # The tiles below are NOT Nexus objects. They are tuples
    # with the time avg map data and lat-lon bounding box.
    for tile in avg_tiles:
        if tile is not None:
            (tile_data, tile_min_lat, tile_max_lat,
             tile_min_lon, tile_max_lon) = tile
            print 'shape tile_data = ', tile_data.shape
            print 'tile data mask = ', tile_data.mask
            sys.stdout.flush()
            # Compute the map indices for this tile's bounding box up
            # front so they are defined for both branches below.
            y0 = self._lat2ind(tile_min_lat)
            y1 = self._lat2ind(tile_max_lat)
            x0 = self._lon2ind(tile_min_lon)
            x1 = self._lon2ind(tile_max_lon)
            if np.any(np.logical_not(tile_data.mask)):
                print 'writing tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                    (tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon, y0, y1, x0, x1)
                sys.stdout.flush()
                a[y0:y1 + 1, x0:x1 + 1] = tile_data
            else:
                print 'All pixels masked in tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                    (tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon, y0, y1, x0, x1)
                sys.stdout.flush()
    self._create_nc_file(a)

    return TimeAvgMapResults(results={}, meta={}, computeOptions=computeOptions)
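The _lat2ind/_lon2ind helpers used above map latitudes and longitudes onto grid indices. A plausible sketch of that convention (hypothetical; the real helpers live on the handler class and may round differently): cell k is centered at min_cent + k * res, so the inverse is a rounded division.

# Hypothetical sketch of the center-based index convention assumed in calc().
def ind2lat_sketch(y, min_lat_cent, lat_res):
    # Cell y is centered at min_lat_cent + y * lat_res.
    return min_lat_cent + y * lat_res

def lat2ind_sketch(lat, min_lat_cent, lat_res):
    # Invert the mapping above by rounded division.
    return int(round((lat - min_lat_cent) / lat_res))

# Round-trip check on a 1-degree grid whose first center is at -89.5.
assert lat2ind_sketch(ind2lat_sketch(5, -89.5, 1.0), -89.5, 1.0) == 5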
def calc(self, computeOptions, **args):
    minLat = computeOptions.get_min_lat()
    maxLat = computeOptions.get_max_lat()
    minLon = computeOptions.get_min_lon()
    maxLon = computeOptions.get_max_lon()
    ds = computeOptions.get_dataset()[0]
    startTime = computeOptions.get_start_time()
    endTime = computeOptions.get_end_time()
    maskLimitType = computeOptions.get_mask_type()

    chunks, meta = self.getChunksForBox(minLat, maxLat, minLon, maxLon, ds, startTime=startTime, endTime=endTime)

    if len(chunks) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    masker = LandMaskChecker(self._landmask, maskLimitType)
    a = self._allocateArray(int(math.ceil(maxLat - minLat)), int(math.ceil(maxLon - minLon)))

    lat = minLat
    y = 0
    x = 0
    while lat < maxLat:
        lon = minLon
        x = 0
        while lon < maxLon:
            values = []
            for n in chunks:
                chunk = chunks[n]
                value = chunk.getValueForLatLon(lat, lon)
                lm = chunk.getLandmaskForLatLon(lat, lon)
                if lm == 1.0 and value != 32767.0 and not masker.isLatLonMasked(lat, lon):
                    values.append(value)

            if len(values) > 0:
                avg = np.average(values)
                min = np.min(values)
                max = np.max(values)
                std = np.std(values)
                cnt = len(values)
                xi = range(0, len(values))
                slope, intercept, r_value, p_value, std_err = stats.linregress(xi, values)
            else:
                avg, min, max, std, cnt = (0, 0, 0, 0, 0)
                slope, intercept, r_value, p_value, std_err = (0, 0, 0, 0, 0)

            # Replace any non-finite statistics with 0.0 before building the response.
            avg = 0.0 if not self._validNumber(float(avg)) else float(avg)
            min = 0.0 if not self._validNumber(float(min)) else float(min)
            max = 0.0 if not self._validNumber(float(max)) else float(max)
            std = 0.0 if not self._validNumber(float(std)) else float(std)
            cnt = 0.0 if not self._validNumber(float(cnt)) else float(cnt)
            slope = 0.0 if not self._validNumber(float(slope)) else float(slope)
            intercept = 0.0 if not self._validNumber(float(intercept)) else float(intercept)
            r_value = 0.0 if not self._validNumber(float(r_value)) else float(r_value)
            p_value = 0.0 if not self._validNumber(float(p_value)) else float(p_value)
            std_err = 0.0 if not self._validNumber(float(std_err)) else float(std_err)

            a[y][x] = {
                'avg': avg,
                'min': min,
                'max': max,
                'std': std,
                'cnt': cnt,
                'slope': slope,
                'intercept': intercept,
                'r': r_value,
                'p': p_value,
                'stderr': std_err,
                'lat': float(lat),
                'lon': float(lon)
            }
            lon = lon + 1
            x = x + 1
        lat = lat + 1
        y = y + 1

    return LongitudeLatitudeMapResults(results=a, meta=meta, computeOptions=computeOptions)
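Each grid cell above fits a least-squares trend through its time-ordered values with scipy's linregress, regressing the values against their sample index. A minimal standalone example of that call:

# Self-contained example of the per-cell trend fit used above.
import numpy as np
from scipy import stats

values = np.array([10.0, 10.4, 10.2, 10.9, 11.1])
xi = list(range(len(values)))  # sample index stands in for time
slope, intercept, r_value, p_value, std_err = stats.linregress(xi, values)
print(slope, r_value)  # positive slope, r close to 1 for a rising series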
def getTimeSeriesStatsForBoxSingleDataSet(self, bounding_polygon, ds, start_seconds_from_epoch,
                                          end_seconds_from_epoch,
                                          apply_seasonal_cycle_filter=True, apply_low_pass_filter=True):

    the_time = datetime.now()
    daysinrange = self._get_tile_service().find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                  bounding_polygon.bounds[3],
                                                                  bounding_polygon.bounds[0],
                                                                  bounding_polygon.bounds[2],
                                                                  ds,
                                                                  start_seconds_from_epoch,
                                                                  end_seconds_from_epoch)
    logger.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    if len(daysinrange) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    the_time = datetime.now()
    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    results = []
    if maxprocesses == 1:
        calculator = TimeSeriesCalculator()
        for dayinseconds in daysinrange:
            result = calculator.calc_average_on_day(bounding_polygon.wkt, ds, dayinseconds)
            results += [result] if result else []
    else:
        # Create a task to calc average difference for each day
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for dayinseconds in daysinrange:
            work_queue.put(('calc_average_on_day', bounding_polygon.wkt, ds, dayinseconds))
        [work_queue.put(SENTINEL) for _ in range(0, maxprocesses)]

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in range(0, maxprocesses)]
        pool.close()

        # Collect the results as [(day (in ms), average difference for that day)]
        for i in range(0, len(daysinrange)):
            result = done_queue.get()
            try:
                error_str = result['error']
                logger.error(error_str)
                raise NexusProcessingException(reason="Error calculating average by day.")
            except KeyError:
                pass

            results += [result] if result else []

        pool.terminate()
        manager.shutdown()

    results = sorted(results, key=lambda entry: entry["time"])
    logger.info("Time series calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    if apply_seasonal_cycle_filter:
        the_time = datetime.now()
        for result in results:
            month = datetime.utcfromtimestamp(result['time']).month
            month_mean, month_max, month_min = self.calculate_monthly_average(month, bounding_polygon.wkt, ds)
            seasonal_mean = result['mean'] - month_mean
            seasonal_min = result['min'] - month_min
            seasonal_max = result['max'] - month_max
            result['meanSeasonal'] = seasonal_mean
            result['minSeasonal'] = seasonal_min
            result['maxSeasonal'] = seasonal_max
        logger.info("Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    the_time = datetime.now()
    filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False, applyLowPass=apply_low_pass_filter)
    filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False, applyLowPass=apply_low_pass_filter)
    filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False, applyLowPass=apply_low_pass_filter)

    if apply_seasonal_cycle_filter and apply_low_pass_filter:
        try:
            filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False, applyLowPass=True,
                                          append="LowPass")
            filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False, applyLowPass=True,
                                          append="LowPass")
            filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False, applyLowPass=True,
                                          append="LowPass")
        except Exception as e:
            # If it doesn't work, log the error but ignore it
            tb = traceback.format_exc()
            logger.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

    logger.info("LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    return results, {}
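The seasonal block above subtracts the per-month climatological mean from each daily statistic to produce the deseasoned fields. A minimal sketch of the same idea with illustrative names (deseason and monthly_means are stand-ins, not codebase functions):

# Sketch of the seasonal-cycle removal used above.
from datetime import datetime

def deseason(results, monthly_means):
    # monthly_means maps month number (1-12) to the climatological mean.
    for result in results:
        month = datetime.utcfromtimestamp(result['time']).month
        result['meanSeasonal'] = result['mean'] - monthly_means[month]
    return results

demo = deseason([{'time': 1500000000, 'mean': 21.0}], {m: 20.0 for m in range(1, 13)})
print(demo[0]['meanSeasonal'])  # 1.0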
def calc(self, computeOptions, **args):
    spark_master, spark_nexecs, spark_nparts = computeOptions.get_spark_cfg()
    self._setQueryParams(computeOptions.get_dataset(),
                         (float(computeOptions.get_min_lat()),
                          float(computeOptions.get_max_lat()),
                          float(computeOptions.get_min_lon()),
                          float(computeOptions.get_max_lon())),
                         computeOptions.get_start_time(),
                         computeOptions.get_end_time(),
                         spark_master=spark_master,
                         spark_nexecs=spark_nexecs,
                         spark_nparts=spark_nparts)

    self.log.debug('ds = {0}'.format(self._ds))
    if not len(self._ds) == 2:
        raise NexusProcessingException(
            reason="Requires two datasets for comparison. Specify request parameter ds=Dataset_1,Dataset_2",
            code=400)
    if next(iter([clim for clim in self._ds if 'CLIM' in clim]), False):
        raise NexusProcessingException(reason="Cannot compute correlation on a climatology", code=400)

    nexus_tiles = self._find_global_tile_set()
    # print 'tiles:'
    # for tile in nexus_tiles:
    #     print tile.granule
    #     print tile.section_spec
    #     print 'lat:', tile.latitudes
    #     print 'lon:', tile.longitudes

    if len(nexus_tiles) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
    self.log.debug('Using Native resolution: lat_res={0}, lon_res={1}'.format(self._latRes, self._lonRes))
    nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
    nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
    self.log.debug('nlats={0}, nlons={1}'.format(nlats, nlons))

    # Create array of tuples to pass to Spark map function
    nexus_tiles_spark = [[self._find_tile_bounds(t),
                          self._startTime,
                          self._endTime,
                          self._ds] for t in nexus_tiles]

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tiles_spark[i]

    # Expand Spark map tuple array by duplicating each entry N times,
    # where N is the number of ways we want the time dimension carved up.
    num_time_parts = 72
    # num_time_parts = 2
    # num_time_parts = 1
    nexus_tiles_spark = np.repeat(nexus_tiles_spark, num_time_parts, axis=0)
    self.log.debug('repeated len(nexus_tiles_spark) = {0}'.format(len(nexus_tiles_spark)))

    # Set the time boundaries for each of the Spark map tuples.
    # Every Nth element in the array gets the same time bounds.
    spark_part_times = np.linspace(self._startTime, self._endTime + 1,
                                   num_time_parts + 1, dtype=np.int64)
    spark_part_time_ranges = \
        np.repeat([[[spark_part_times[i], spark_part_times[i + 1] - 1]
                    for i in range(num_time_parts)]],
                  len(nexus_tiles_spark) / num_time_parts,
                  axis=0).reshape((len(nexus_tiles_spark), 2))
    self.log.debug('spark_part_time_ranges={0}'.format(spark_part_time_ranges))
    nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
    # print 'nexus_tiles_spark final = '
    # for i in range(len(nexus_tiles_spark)):
    #     print nexus_tiles_spark[i]

    # Launch Spark computations
    # print 'nexus_tiles_spark=', nexus_tiles_spark
    rdd = self._sc.parallelize(nexus_tiles_spark, self._spark_nparts)
    sum_tiles_part = rdd.map(self._map)
    # print "sum_tiles_part = ", sum_tiles_part.collect()
    sum_tiles = \
        sum_tiles_part.combineByKey(lambda val: val,
                                    lambda x, val: (x[0] + val[0], x[1] + val[1],
                                                    x[2] + val[2], x[3] + val[3],
                                                    x[4] + val[4], x[5] + val[5]),
                                    lambda x, y: (x[0] + y[0], x[1] + y[1],
                                                  x[2] + y[2], x[3] + y[3],
                                                  x[4] + y[4], x[5] + y[5]))
    # Convert the N (pixel-wise count) array for each tile to be a
    # NumPy masked array.  That is the last array in the tuple of
    # intermediate summation arrays.  Set mask to True if count is 0.
    sum_tiles = \
        sum_tiles.map(lambda (bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy, n)):
                      (bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy,
                                np.ma.array(n, mask=~(n.astype(bool))))))
    # print 'sum_tiles = ', sum_tiles.collect()

    # For each pixel in each tile compute an array of Pearson
    # correlation coefficients.  The map function is called once
    # per tile.  The result of this map operation is a list of 3-tuples of
    # (bounds, r, n) for each tile (r=Pearson correlation coefficient
    # and n=number of input values that went into each pixel with
    # any masked values not included).
    corr_tiles = \
        sum_tiles.map(lambda (bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy, n)):
                      (bounds,
                       np.ma.array(((sum_xy - sum_x * sum_y / n) /
                                    np.sqrt((sum_xx - sum_x * sum_x / n) *
                                            (sum_yy - sum_y * sum_y / n))),
                                   mask=~(n.astype(bool))),
                       n)).collect()

    r = np.zeros((nlats, nlons), dtype=np.float64, order='C')
    n = np.zeros((nlats, nlons), dtype=np.uint32, order='C')

    # The tiles below are NOT Nexus objects.  They are tuples
    # with the following for each correlation map subset:
    # (1) lat-lon bounding box, (2) array of correlation r values,
    # and (3) array of count n values.
    for tile in corr_tiles:
        ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon),
         tile_data, tile_cnt) = tile
        y0 = self._lat2ind(tile_min_lat)
        y1 = self._lat2ind(tile_max_lat)
        x0 = self._lon2ind(tile_min_lon)
        x1 = self._lon2ind(tile_max_lon)
        self.log.debug('writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                       .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                               y0, y1, x0, x1))
        r[y0:y1 + 1, x0:x1 + 1] = tile_data
        n[y0:y1 + 1, x0:x1 + 1] = tile_cnt

    # Store global map in a NetCDF file.
    self._create_nc_file(r, 'corrmap.nc', 'r')

    # Create dict for JSON response
    results = [[{'r': r[y, x], 'cnt': int(n[y, x]),
                 'lat': self._ind2lat(y), 'lon': self._ind2lon(x)}
                for x in range(r.shape[1])] for y in range(r.shape[0])]

    return CorrelationResults(results)
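The pipeline above accumulates the six sums (Sx, Sy, Sxx, Syy, Sxy, n) in the reduce step and then evaluates the Pearson formula pixel-wise. A quick numeric check that the sum-based formula matches numpy's correlation on toy data:

# Verify the sum-based Pearson formula used in the map step above:
# r = (Sxy - Sx*Sy/n) / sqrt((Sxx - Sx^2/n) * (Syy - Sy^2/n))
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([1.1, 1.9, 3.2, 3.9])
n = len(x)
sum_x, sum_y = x.sum(), y.sum()
sum_xx, sum_yy, sum_xy = (x * x).sum(), (y * y).sum(), (x * y).sum()
r = (sum_xy - sum_x * sum_y / n) / np.sqrt((sum_xx - sum_x ** 2 / n) *
                                           (sum_yy - sum_y ** 2 / n))
assert np.isclose(r, np.corrcoef(x, y)[0, 1])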
def calc(self, computeOptions, **args):
    """
    :param computeOptions: StatsComputeOptions
    :param args: dict
    :return:
    """
    spark_master, spark_nexecs, spark_nparts = computeOptions.get_spark_cfg()
    self._setQueryParams(computeOptions.get_dataset()[0],
                         (float(computeOptions.get_min_lat()),
                          float(computeOptions.get_max_lat()),
                          float(computeOptions.get_min_lon()),
                          float(computeOptions.get_max_lon())),
                         computeOptions.get_start_time(),
                         computeOptions.get_end_time(),
                         spark_master=spark_master,
                         spark_nexecs=spark_nexecs,
                         spark_nparts=spark_nparts)

    if 'CLIM' in self._ds:
        raise NexusProcessingException(
            reason="Cannot compute Latitude/Longitude Time Average plot on a climatology",
            code=400)

    nexus_tiles = self._find_global_tile_set()
    # print 'tiles:'
    # for tile in nexus_tiles:
    #     print tile.granule
    #     print tile.section_spec
    #     print 'lat:', tile.latitudes
    #     print 'lon:', tile.longitudes

    if len(nexus_tiles) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
    self.log.debug('Using Native resolution: lat_res={0}, lon_res={1}'.format(self._latRes, self._lonRes))
    nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
    nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
    self.log.debug('nlats={0}, nlons={1}'.format(nlats, nlons))
    self.log.debug('center lat range = {0} to {1}'.format(self._minLatCent, self._maxLatCent))
    self.log.debug('center lon range = {0} to {1}'.format(self._minLonCent, self._maxLonCent))
    # for tile in nexus_tiles:
    #     print 'lats: ', tile.latitudes.compressed()
    #     print 'lons: ', tile.longitudes.compressed()

    # Create array of tuples to pass to Spark map function
    nexus_tiles_spark = [[self._find_tile_bounds(t),
                          self._startTime,
                          self._endTime,
                          self._ds] for t in nexus_tiles]
    # print 'nexus_tiles_spark = ', nexus_tiles_spark

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tiles_spark[i]

    # Expand Spark map tuple array by duplicating each entry N times,
    # where N is the number of ways we want the time dimension carved up.
    num_time_parts = 72
    # num_time_parts = 1
    nexus_tiles_spark = np.repeat(nexus_tiles_spark, num_time_parts, axis=0)
    self.log.debug('repeated len(nexus_tiles_spark) = {0}'.format(len(nexus_tiles_spark)))

    # Set the time boundaries for each of the Spark map tuples.
    # Every Nth element in the array gets the same time bounds.
    spark_part_times = np.linspace(self._startTime, self._endTime,
                                   num_time_parts + 1, dtype=np.int64)
    spark_part_time_ranges = \
        np.repeat([[[spark_part_times[i], spark_part_times[i + 1]]
                    for i in range(num_time_parts)]],
                  len(nexus_tiles_spark) / num_time_parts,
                  axis=0).reshape((len(nexus_tiles_spark), 2))
    self.log.debug('spark_part_time_ranges={0}'.format(spark_part_time_ranges))
    nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
    # print 'nexus_tiles_spark final = '
    # for i in range(len(nexus_tiles_spark)):
    #     print nexus_tiles_spark[i]

    # Launch Spark computations
    rdd = self._sc.parallelize(nexus_tiles_spark, self._spark_nparts)
    sum_count_part = rdd.map(self._map)
    sum_count = \
        sum_count_part.combineByKey(lambda val: val,
                                    lambda x, val: (x[0] + val[0], x[1] + val[1]),
                                    lambda x, y: (x[0] + y[0], x[1] + y[1]))
    fill = self._fill
    avg_tiles = \
        sum_count.map(lambda (bounds, (sum_tile, cnt_tile)):
                      (bounds, [[{'avg': (sum_tile[y, x] / cnt_tile[y, x])
                                  if (cnt_tile[y, x] > 0) else fill,
                                  'cnt': cnt_tile[y, x]}
                                 for x in range(sum_tile.shape[1])]
                                for y in range(sum_tile.shape[0])])).collect()

    # Combine subset results to produce global map.
    #
    # The tiles below are NOT Nexus objects.  They are tuples
    # with the time avg map data and lat-lon bounding box.
    a = np.zeros((nlats, nlons), dtype=np.float64, order='C')
    n = np.zeros((nlats, nlons), dtype=np.uint32, order='C')
    for tile in avg_tiles:
        if tile is not None:
            ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon), tile_stats) = tile
            tile_data = np.ma.array(
                [[tile_stats[y][x]['avg'] for x in range(len(tile_stats[0]))]
                 for y in range(len(tile_stats))])
            tile_cnt = np.array(
                [[tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))]
                 for y in range(len(tile_stats))])
            tile_data.mask = ~(tile_cnt.astype(bool))
            y0 = self._lat2ind(tile_min_lat)
            y1 = y0 + tile_data.shape[0] - 1
            x0 = self._lon2ind(tile_min_lon)
            x1 = x0 + tile_data.shape[1] - 1
            if np.any(np.logical_not(tile_data.mask)):
                self.log.debug('writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                               .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                                       y0, y1, x0, x1))
                a[y0:y1 + 1, x0:x1 + 1] = tile_data
                n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
            else:
                self.log.debug('All pixels masked in tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                               .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                                       y0, y1, x0, x1))

    # Store global map in a NetCDF file.
    self._create_nc_file(a, 'tam.nc', 'val', fill=self._fill)

    # Create dict for JSON response
    results = [[{'avg': a[y, x], 'cnt': int(n[y, x]),
                 'lat': self._ind2lat(y), 'lon': self._ind2lon(x)}
                for x in range(a.shape[1])] for y in range(a.shape[0])]

    return TimeAvgMapSparkResults(results=results, meta={}, computeOptions=computeOptions)
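The combineByKey step above reduces each tile key to a (sum, count) pair, and the final map turns that into a per-pixel mean with a fill value wherever the count is zero. A small numpy demo of that last step:

# Demo of the sum/count-to-mean step above, with a fill for empty pixels.
import numpy as np

sum_tile = np.array([[6.0, 0.0], [9.0, 3.0]])
cnt_tile = np.array([[3, 0], [3, 1]])
fill = -9999.
# np.maximum avoids a divide-by-zero warning; np.where picks the fill.
avg = np.where(cnt_tile > 0, sum_tile / np.maximum(cnt_tile, 1), fill)
print(avg)  # [[2., -9999.], [3., 3.]]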
def calc(self, compute_options, **args):
    """
    :param compute_options: StatsComputeOptions
    :param args: dict
    :return:
    """
    request_start_time = datetime.now()
    metrics_record = self._create_metrics_record()

    ds, bbox, start_time, end_time, nparts_requested = self.parse_arguments(compute_options)
    self._setQueryParams(ds,
                         (float(bbox.bounds[1]),
                          float(bbox.bounds[3]),
                          float(bbox.bounds[0]),
                          float(bbox.bounds[2])),
                         start_time,
                         end_time)

    nexus_tiles = self._find_global_tile_set(metrics_callback=metrics_record.record_metrics)

    if len(nexus_tiles) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
    print('Found {} tiles'.format(len(nexus_tiles)))

    daysinrange = self._get_tile_service().find_days_in_range_asc(bbox.bounds[1], bbox.bounds[3],
                                                                  bbox.bounds[0], bbox.bounds[2],
                                                                  ds, start_time, end_time,
                                                                  metrics_callback=metrics_record.record_metrics)
    ndays = len(daysinrange)
    if ndays == 0:
        raise NoDataException(reason="No data found for selected timeframe")
    self.log.debug('Found {0} days in range'.format(ndays))
    for i, d in enumerate(daysinrange):
        self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
    self.log.debug('Using Native resolution: lat_res={0}, lon_res={1}'.format(self._latRes, self._lonRes))
    self.log.debug('nlats={0}, nlons={1}'.format(self._nlats, self._nlons))
    self.log.debug('center lat range = {0} to {1}'.format(self._minLatCent, self._maxLatCent))
    self.log.debug('center lon range = {0} to {1}'.format(self._minLonCent, self._maxLonCent))

    # Create array of tuples to pass to Spark map function
    nexus_tiles_spark = [[self._find_tile_bounds(t),
                          self._startTime,
                          self._endTime,
                          self._ds] for t in nexus_tiles]

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tiles_spark[i]

    # Expand Spark map tuple array by duplicating each entry N times,
    # where N is the number of ways we want the time dimension carved up.
    # Set the time boundaries for each of the Spark map tuples so that
    # every Nth element in the array gets the same time bounds.
    max_time_parts = 72
    num_time_parts = min(max_time_parts, ndays)

    spark_part_time_ranges = np.tile(
        np.array([a[[0, -1]] for a in np.array_split(np.array(daysinrange), num_time_parts)]),
        (len(nexus_tiles_spark), 1))
    nexus_tiles_spark = np.repeat(nexus_tiles_spark, num_time_parts, axis=0)
    nexus_tiles_spark[:, 1:3] = spark_part_time_ranges

    # Launch Spark computations
    spark_nparts = self._spark_nparts(nparts_requested)
    self.log.info('Using {} partitions'.format(spark_nparts))

    rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts)
    metrics_record.record_metrics(partitions=rdd.getNumPartitions())
    sum_count_part = rdd.map(partial(self._map, self._tile_service_factory, metrics_record.record_metrics))

    reduce_duration = 0
    reduce_start = datetime.now()
    sum_count = sum_count_part.combineByKey(lambda val: val,
                                            lambda x, val: (x[0] + val[0], x[1] + val[1]),
                                            lambda x, y: (x[0] + y[0], x[1] + y[1]))
    reduce_duration += (datetime.now() - reduce_start).total_seconds()

    avg_tiles = sum_count.map(partial(calculate_means, metrics_record.record_metrics, self._fill)).collect()

    reduce_start = datetime.now()
    # Combine subset results to produce global map.
    #
    # The tiles below are NOT Nexus objects.  They are tuples
    # with the time avg map data and lat-lon bounding box.
    a = np.zeros((self._nlats, self._nlons), dtype=np.float64, order='C')
    n = np.zeros((self._nlats, self._nlons), dtype=np.uint32, order='C')
    for tile in avg_tiles:
        if tile is not None:
            ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon), tile_stats) = tile
            tile_data = np.ma.array(
                [[tile_stats[y][x]['avg'] for x in range(len(tile_stats[0]))]
                 for y in range(len(tile_stats))])
            tile_cnt = np.array(
                [[tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))]
                 for y in range(len(tile_stats))])
            tile_data.mask = ~(tile_cnt.astype(bool))
            y0 = self._lat2ind(tile_min_lat)
            y1 = y0 + tile_data.shape[0] - 1
            x0 = self._lon2ind(tile_min_lon)
            x1 = x0 + tile_data.shape[1] - 1
            if np.any(np.logical_not(tile_data.mask)):
                self.log.debug('writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                               .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                                       y0, y1, x0, x1))
                a[y0:y1 + 1, x0:x1 + 1] = tile_data
                n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
            else:
                self.log.debug('All pixels masked in tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                               .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                                       y0, y1, x0, x1))

    # Store the global map in a NetCDF file for debugging purposes.
    # If activated, this line is not thread safe and may cause errors
    # under concurrent access.
    # self._create_nc_file(a, 'tam.nc', 'val', fill=self._fill)

    # Create dict for JSON response
    results = [[{'mean': a[y, x], 'cnt': int(n[y, x]),
                 'lat': self._ind2lat(y), 'lon': self._ind2lon(x)}
                for x in range(a.shape[1])] for y in range(a.shape[0])]

    total_duration = (datetime.now() - request_start_time).total_seconds()
    metrics_record.record_metrics(actual_time=total_duration, reduce=reduce_duration)
    metrics_record.print_metrics(self.log)

    return NexusResults(results=results, meta={}, stats=None, computeOptions=None,
                        minLat=bbox.bounds[1], maxLat=bbox.bounds[3],
                        minLon=bbox.bounds[0], maxLon=bbox.bounds[2],
                        ds=ds, startTime=start_time, endTime=end_time)
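The time-partitioning above splits the in-range days into at most 72 chunks and keeps each chunk's first and last epoch as the [start, end] bounds written into columns 1:3 of the Spark tuples. A small demo of how those bounds are derived:

# Demo of the time-partition bounds used above.
import numpy as np

daysinrange = np.arange(10) * 86400   # ten daily epochs
num_time_parts = 3
# np.array_split tolerates uneven splits; a[[0, -1]] keeps each
# chunk's first and last epoch as its [start, end] bounds.
part_ranges = np.array([a[[0, -1]] for a in np.array_split(daysinrange, num_time_parts)])
print(part_ranges)  # one [start, end] pair per time partition
# np.tile then replicates these rows once per tile tuple.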
def calc(self, request, **args):
    """
    :param request: StatsComputeOptions
    :param args: dict
    :return:
    """
    start_time = datetime.now()
    ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, apply_seasonal_cycle_filter, \
        apply_low_pass_filter, nparts_requested, normalize_dates = self.parse_arguments(request)
    metrics_record = self._create_metrics_record()

    resultsRaw = []

    for shortName in ds:
        the_time = datetime.now()
        daysinrange = self._get_tile_service().find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                      bounding_polygon.bounds[3],
                                                                      bounding_polygon.bounds[0],
                                                                      bounding_polygon.bounds[2],
                                                                      shortName,
                                                                      start_seconds_from_epoch,
                                                                      end_seconds_from_epoch,
                                                                      metrics_callback=metrics_record.record_metrics)
        self.log.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
        spark_nparts = self._spark_nparts(nparts_requested)
        self.log.info('Using {} partitions'.format(spark_nparts))

        results, meta = spark_driver(daysinrange, bounding_polygon, shortName,
                                     self._tile_service_factory,
                                     metrics_record.record_metrics, normalize_dates,
                                     spark_nparts=spark_nparts, sc=self._sc)

        if apply_seasonal_cycle_filter:
            the_time = datetime.now()
            # Get a time series for the corresponding climatology dataset.
            shortName_clim = shortName + "_clim"
            daysinrange_clim = self._get_tile_service().find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                               bounding_polygon.bounds[3],
                                                                               bounding_polygon.bounds[0],
                                                                               bounding_polygon.bounds[2],
                                                                               shortName_clim,
                                                                               0,
                                                                               SECONDS_IN_ONE_YEAR,
                                                                               metrics_callback=metrics_record.record_metrics)
            if len(daysinrange_clim) == 0:
                raise NexusProcessingException(
                    reason="There is no climatology data present for dataset " + shortName + ".")
            results_clim, _ = spark_driver(daysinrange_clim, bounding_polygon, shortName_clim,
                                           self._tile_service_factory,
                                           metrics_record.record_metrics,
                                           normalize_dates=False,
                                           spark_nparts=spark_nparts, sc=self._sc)
            clim_indexed_by_month = {datetime.utcfromtimestamp(result['time']).month: result
                                     for result in results_clim}
            if len(clim_indexed_by_month) < 12:
                raise NexusProcessingException(
                    reason="There are only {0} months of climatology data for dataset {1}. "
                           "A full year of climatology data is required for computing "
                           "deseasoned timeseries.".format(len(clim_indexed_by_month), shortName))

            for result in results:
                month = datetime.utcfromtimestamp(result['time']).month
                result['meanSeasonal'] = result['mean'] - clim_indexed_by_month[month]['mean']
                result['minSeasonal'] = result['min'] - clim_indexed_by_month[month]['min']
                result['maxSeasonal'] = result['max'] - clim_indexed_by_month[month]['max']
            self.log.info("Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)

        if apply_seasonal_cycle_filter and apply_low_pass_filter:
            try:
                filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False,
                                              applyLowPass=True, append="LowPass")
                filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False,
                                              applyLowPass=True, append="LowPass")
                filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False,
                                              applyLowPass=True, append="LowPass")
            except Exception as e:
                # If it doesn't work, log the error but ignore it
                tb = traceback.format_exc()
                self.log.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

        resultsRaw.append([results, meta])
        self.log.info("LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean', fill=-9999.)
        self.log.info("NetCDF generation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

    the_time = datetime.now()
    results = self._mergeResults(resultsRaw)

    if len(ds) == 2:
        try:
            stats = TimeSeriesSparkHandlerImpl.calculate_comparison_stats(results)
        except Exception:
            stats = {}
            tb = traceback.format_exc()
            self.log.warn("Error when calculating comparison stats:\n%s" % tb)
    else:
        stats = {}

    meta = []
    for singleRes in resultsRaw:
        meta.append(singleRes[1])

    res = TimeSeriesResults(results=results, meta=meta, stats=stats, computeOptions=None,
                            minLat=bounding_polygon.bounds[1], maxLat=bounding_polygon.bounds[3],
                            minLon=bounding_polygon.bounds[0], maxLon=bounding_polygon.bounds[2],
                            ds=ds, startTime=start_seconds_from_epoch, endTime=end_seconds_from_epoch)

    total_duration = (datetime.now() - start_time).total_seconds()
    metrics_record.record_metrics(actual_time=total_duration)
    metrics_record.print_metrics(logger)

    self.log.info("Merging results and calculating comparisons took %s" % (str(datetime.now() - the_time)))
    return res
def calc(self, compute_options, **args):
    """
    :param compute_options: StatsComputeOptions
    :param args: dict
    :return:
    """
    ds, bbox, start_time, end_time, nparts_requested = self.parse_arguments(compute_options)
    self._setQueryParams(ds,
                         (float(bbox.bounds[1]),
                          float(bbox.bounds[3]),
                          float(bbox.bounds[0]),
                          float(bbox.bounds[2])),
                         start_time,
                         end_time)

    nexus_tiles = self._find_global_tile_set()

    if len(nexus_tiles) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
    print('Found {} tiles'.format(len(nexus_tiles)))

    daysinrange = self._tile_service.find_days_in_range_asc(bbox.bounds[1], bbox.bounds[3],
                                                            bbox.bounds[0], bbox.bounds[2],
                                                            ds, start_time, end_time)
    ndays = len(daysinrange)
    if ndays == 0:
        raise NoDataException(reason="No data found for selected timeframe")
    self.log.debug('Found {0} days in range'.format(ndays))
    for i, d in enumerate(daysinrange):
        self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
    self.log.debug('Using Native resolution: lat_res={0}, lon_res={1}'.format(self._latRes, self._lonRes))
    self.log.debug('nlats={0}, nlons={1}'.format(self._nlats, self._nlons))
    self.log.debug('center lat range = {0} to {1}'.format(self._minLatCent, self._maxLatCent))
    self.log.debug('center lon range = {0} to {1}'.format(self._minLonCent, self._maxLonCent))

    # Create array of tuples to pass to Spark map function
    nexus_tiles_spark = [[self._find_tile_bounds(t),
                          self._startTime,
                          self._endTime,
                          self._ds] for t in nexus_tiles]

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tiles_spark[i]

    # Expand Spark map tuple array by duplicating each entry N times,
    # where N is the number of ways we want the time dimension carved up.
    # Set the time boundaries for each of the Spark map tuples so that
    # every Nth element in the array gets the same time bounds.
    max_time_parts = 72
    num_time_parts = min(max_time_parts, ndays)

    spark_part_time_ranges = np.tile(
        np.array([a[[0, -1]] for a in np.array_split(np.array(daysinrange), num_time_parts)]),
        (len(nexus_tiles_spark), 1))
    nexus_tiles_spark = np.repeat(nexus_tiles_spark, num_time_parts, axis=0)
    nexus_tiles_spark[:, 1:3] = spark_part_time_ranges

    # Launch Spark computations to calculate x_bar
    spark_nparts = self._spark_nparts(nparts_requested)
    self.log.info('Using {} partitions'.format(spark_nparts))
    rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts)
    sum_count_part = rdd.map(self._map)
    sum_count = \
        sum_count_part.combineByKey(lambda val: val,
                                    lambda x, val: (x[0] + val[0], x[1] + val[1]),
                                    lambda x, y: (x[0] + y[0], x[1] + y[1]))
    fill = self._fill
    avg_tiles = \
        sum_count.map(lambda (bounds, (sum_tile, cnt_tile)):
                      (bounds, [[(sum_tile[y, x] / cnt_tile[y, x])
                                 if (cnt_tile[y, x] > 0) else fill
                                 for x in range(sum_tile.shape[1])]
                                for y in range(sum_tile.shape[0])])).collect()

    #
    # Launch a second parallel computation to calculate variance from x_bar
    #
    # Create array of tuples to pass to the Spark map function. The first
    # param is the tile bounds that were in the results and the last param
    # is the data from the results (x_bar).
    nexus_tiles_spark = [[t[0], self._startTime, self._endTime, self._ds, t[1]] for t in avg_tiles]

    self.log.info('Using {} partitions'.format(spark_nparts))
    rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts)
    anomaly_squared_part = rdd.map(self._calc_variance)
    anomaly_squared = \
        anomaly_squared_part.combineByKey(lambda val: val,
                                          lambda x, val: (x[0] + val[0], x[1] + val[1]),
                                          lambda x, y: (x[0] + y[0], x[1] + y[1]))
    variance_tiles = \
        anomaly_squared.map(lambda (bounds, (anomaly_squared_tile, cnt_tile)):
                            (bounds, [[{'variance': (anomaly_squared_tile[y, x] / cnt_tile[y, x])
                                        if (cnt_tile[y, x] > 0) else fill,
                                        'cnt': cnt_tile[y, x]}
                                       for x in range(anomaly_squared_tile.shape[1])]
                                      for y in range(anomaly_squared_tile.shape[0])])).collect()

    # Combine subset results to produce global map.
    #
    # The tiles below are NOT Nexus objects.  They are tuples
    # with the time avg map data and lat-lon bounding box.
    a = np.zeros((self._nlats, self._nlons), dtype=np.float64, order='C')
    n = np.zeros((self._nlats, self._nlons), dtype=np.uint32, order='C')
    for tile in variance_tiles:
        if tile is not None:
            ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon), tile_stats) = tile
            tile_data = np.ma.array(
                [[tile_stats[y][x]['variance'] for x in range(len(tile_stats[0]))]
                 for y in range(len(tile_stats))])
            tile_cnt = np.array(
                [[tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))]
                 for y in range(len(tile_stats))])
            tile_data.mask = ~(tile_cnt.astype(bool))
            y0 = self._lat2ind(tile_min_lat)
            y1 = y0 + tile_data.shape[0] - 1
            x0 = self._lon2ind(tile_min_lon)
            x1 = x0 + tile_data.shape[1] - 1
            if np.any(np.logical_not(tile_data.mask)):
                self.log.debug('writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                               .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                                       y0, y1, x0, x1))
                a[y0:y1 + 1, x0:x1 + 1] = tile_data
                n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
            else:
                self.log.debug('All pixels masked in tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                               .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                                       y0, y1, x0, x1))

    # Store global map in a NetCDF file.
    self._create_nc_file(a, 'tam.nc', 'val', fill=self._fill)

    # Create dict for JSON response
    results = [[{'variance': a[y, x], 'cnt': int(n[y, x]),
                 'lat': self._ind2lat(y), 'lon': self._ind2lon(x)}
                for x in range(a.shape[1])] for y in range(a.shape[0])]

    return NexusResults(results=results, meta={}, stats=None, computeOptions=None,
                        minLat=bbox.bounds[1], maxLat=bbox.bounds[3],
                        minLon=bbox.bounds[0], maxLon=bbox.bounds[2],
                        ds=ds, startTime=start_time, endTime=end_time)
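The two Spark passes above implement a textbook two-pass variance: the first pass yields the per-pixel mean (x_bar) from sums and counts, the second averages the squared anomalies about that mean. A quick standalone check of the arithmetic:

# Check of the two-pass variance computed above; matches np.var.
import numpy as np

x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
x_bar = x.sum() / len(x)                       # first pass: sum / count
variance = ((x - x_bar) ** 2).sum() / len(x)   # second pass: mean squared anomaly
assert np.isclose(variance, np.var(x))
print(variance)  # 4.0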
def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0,
                                          end_time=-1, applySeasonalFilter=False, applyLowPass=False):
    daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds,
                                                            start_time, end_time)

    if len(daysinrange) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    print 'Found %d days in range' % len(daysinrange)

    cwd = os.getcwd()

    # Configure Spark
    sp_conf = SparkConf()
    sp_conf.setAppName("Spark Time Avg Map")
    sp_conf.set("spark.executorEnv.HOME", os.path.join(os.getenv('HOME'), 'spark_exec_home'))
    sp_conf.set("spark.executorEnv.PYTHONPATH", cwd)
    # sp_conf.set("spark.yarn.executor.memoryOverhead", "4000")
    sp_conf.set("spark.executor.memory", "4g")

    # num_parts = 1
    # num_parts = 16
    # num_parts = 32
    # num_parts = 64
    num_parts = 128
    # num_execs = 1
    # num_execs = 16
    # num_execs = 32
    num_execs = 64
    cores_per_exec = 1
    sp_conf.setMaster("yarn-client")
    # sp_conf.setMaster("local[16]")
    # sp_conf.setMaster("local[1]")
    sp_conf.set("spark.executor.instances", num_execs)
    sp_conf.set("spark.executor.cores", cores_per_exec)

    # print sp_conf.getAll()
    sc = SparkContext(conf=sp_conf)

    nexus_tiles_spark = [(min_lat, max_lat, min_lon, max_lon, ds, list(daysinrange_part), cwd)
                         for daysinrange_part in np.array_split(daysinrange, num_parts)]
    # for tile in nexus_tiles_spark:
    #     print tile

    # Launch Spark computations
    rdd = sc.parallelize(nexus_tiles_spark, num_parts)
    results = rdd.map(TimeSeriesCalculator.calc_average_on_day).collect()
    # results = list(itertools.chain.from_iterable(results))
    results = sorted(results, key=lambda entry: entry["time"])

    # filt.applyAllFiltersOnField(results, 'mean', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    # filt.applyAllFiltersOnField(results, 'max', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    # filt.applyAllFiltersOnField(results, 'min', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)

    self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean')

    return results, {}
def calc(self, request, **args):
    """
    :param request: StatsComputeOptions
    :param args: dict
    :return:
    """
    ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, apply_seasonal_cycle_filter, \
        apply_low_pass_filter, nparts_requested = self.parse_arguments(request)

    resultsRaw = []

    for shortName in ds:
        the_time = datetime.now()
        daysinrange = self._tile_service.find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                bounding_polygon.bounds[3],
                                                                bounding_polygon.bounds[0],
                                                                bounding_polygon.bounds[2],
                                                                shortName,
                                                                start_seconds_from_epoch,
                                                                end_seconds_from_epoch)
        self.log.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))

        spark_nparts = self._spark_nparts(nparts_requested)
        self.log.info('Using {} partitions'.format(spark_nparts))

        the_time = datetime.now()
        results, meta = spark_driver(daysinrange, bounding_polygon, shortName,
                                     spark_nparts=spark_nparts, sc=self._sc)
        self.log.info("Time series calculation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        if apply_seasonal_cycle_filter:
            the_time = datetime.now()
            for result in results:
                month = datetime.utcfromtimestamp(result['time']).month
                month_mean, month_max, month_min = self.calculate_monthly_average(month,
                                                                                  bounding_polygon.wkt,
                                                                                  shortName)
                seasonal_mean = result['mean'] - month_mean
                seasonal_min = result['min'] - month_min
                seasonal_max = result['max'] - month_max
                result['meanSeasonal'] = seasonal_mean
                result['minSeasonal'] = seasonal_min
                result['maxSeasonal'] = seasonal_max
            self.log.info("Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)

        if apply_seasonal_cycle_filter and apply_low_pass_filter:
            try:
                filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False,
                                              applyLowPass=True, append="LowPass")
                filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False,
                                              applyLowPass=True, append="LowPass")
                filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False,
                                              applyLowPass=True, append="LowPass")
            except Exception as e:
                # If it doesn't work, log the error but ignore it
                tb = traceback.format_exc()
                self.log.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

        resultsRaw.append([results, meta])
        self.log.info("LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean', fill=-9999.)
        self.log.info("NetCDF generation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

    the_time = datetime.now()
    results = self._mergeResults(resultsRaw)

    if len(ds) == 2:
        try:
            stats = TimeSeriesHandlerImpl.calculate_comparison_stats(results)
        except Exception:
            stats = {}
            tb = traceback.format_exc()
            self.log.warn("Error when calculating comparison stats:\n%s" % tb)
    else:
        stats = {}

    meta = []
    for singleRes in resultsRaw:
        meta.append(singleRes[1])

    res = TimeSeriesResults(results=results, meta=meta, stats=stats, computeOptions=None,
                            minLat=bounding_polygon.bounds[1], maxLat=bounding_polygon.bounds[3],
                            minLon=bounding_polygon.bounds[0], maxLon=bounding_polygon.bounds[2],
                            ds=ds, startTime=start_seconds_from_epoch, endTime=end_seconds_from_epoch)

    self.log.info("Merging results and calculating comparisons took %s" % (str(datetime.now() - the_time)))
    return res