def __init__(self, layout, crs=None, extent=None, cellsize=None, dimensions=None):
    self.__jvm = gps.get_spark_context()._gateway.jvm

    if isinstance(layout, gps.LocalLayout):
        if not extent:
            raise ValueError("Must specify an extent when using LocalLayout")

        if dimensions and not cellsize:
            cellsize = ((extent.xmax - extent.xmin) / dimensions[0],
                        (extent.ymax - extent.ymin) / dimensions[1])
            dimensions = None

        if cellsize and not dimensions:
            tilewidth = layout.tile_cols * cellsize[0]
            tileheight = layout.tile_rows * cellsize[1]
            cols = ceil((extent.xmax - extent.xmin) / tilewidth)
            rows = ceil((extent.ymax - extent.ymin) / tileheight)
            extent = gps.Extent(extent.xmin,
                                extent.ymax - rows * tileheight,
                                extent.xmin + cols * tilewidth,
                                extent.ymax)
            tl = gps.TileLayout(cols, rows, layout.tile_cols, layout.tile_rows)
        else:
            raise ValueError("For LocalLayout, must specify exactly one of cellsize or dimensions")
    elif isinstance(layout, gps.GlobalLayout):
        try:
            from pyproj import Proj
        except ImportError:
            raise ImportError('pyproj is required for GlobalLayout')

        if not layout.zoom:
            raise ValueError("Must specify a zoom level when using GlobalLayout")

        if not crs:
            raise ValueError("Must specify a crs when using GlobalLayout")

        if isinstance(crs, int):
            crs = "{}".format(crs)

        gtcrs = self.__jvm.geopyspark.geotrellis.TileLayer.getCRS(crs).get()
        if gtcrs.epsgCode().isDefined() and gtcrs.epsgCode().get() == 3857:
            extent = WEB_MERCATOR
        elif gtcrs.epsgCode().isDefined() and gtcrs.epsgCode().get() == 4326:
            extent = LATLNG
        else:
            llex = LATLNG
            proj4str = gtcrs.toProj4String()
            target = Proj(proj4str)
            xmin, ymin = target(llex.xmin, llex.ymin)
            xmax, ymax = target(llex.xmax, llex.ymax)
            extent = gps.Extent(xmin, ymin, xmax, ymax)

        layout_rows_cols = int(pow(2, layout.zoom))
        tl = gps.TileLayout(layout_rows_cols, layout_rows_cols,
                            layout.tile_size, layout.tile_size)
    elif isinstance(layout, gps.LayoutDefinition):
        extent = layout.extent
        tl = layout.tileLayout

    ex = self.__jvm.geotrellis.vector.Extent(float(extent.xmin), float(extent.ymin),
                                             float(extent.xmax), float(extent.ymax))
    tilelayout = self.__jvm.geotrellis.raster.TileLayout(int(tl[0]), int(tl[1]),
                                                         int(tl[2]), int(tl[3]))
    self.layout = gps.LayoutDefinition(extent, tl)
    self.__layout = self.__jvm.geotrellis.spark.tiling.LayoutDefinition(ex, tilelayout)
class KeyTransformTest(BaseTestClass):
    layout = gps.LayoutDefinition(gps.Extent(0, 0, 1, 1), gps.TileLayout(5, 5, 2, 2))

    def test_key_to_extent(self):
        kt_layout = gps.KeyTransform(self.layout)
        self.assertEqual(gps.Extent(0.0, 0.8, 0.2, 1.0),
                         kt_layout.key_to_extent(gps.SpatialKey(0, 0)))

        kt_local = gps.KeyTransform(gps.LocalLayout(2),
                                    extent=gps.Extent(0, 0, 1, 1),
                                    dimensions=(10, 10))
        self.assertEqual(gps.Extent(0.0, 0.8, 0.2, 1.0),
                         kt_local.key_to_extent(gps.SpatialKey(0, 0)))

        kt_global = gps.KeyTransform(gps.GlobalLayout(zoom=1), crs=4326)
        nw_global_extent = kt_global.key_to_extent(gps.SpatialKey(0, 0))
        self.assertTrue(
            abs(nw_global_extent.xmin + 180.0) <= 1e-4 and
            abs(nw_global_extent.xmax) <= 1e-4 and
            abs(nw_global_extent.ymin) <= 1e-4 and
            abs(nw_global_extent.ymax - 90) <= 1e-4)

    def test_extent_to_key(self):
        kt = gps.KeyTransform(self.layout)
        self.assertTrue(
            set(kt.extent_to_keys(gps.Extent(0, 0, 0.4, 0.4))) ==
            set([gps.SpatialKey(x, y) for x in [0, 1] for y in [3, 4]]))

    def test_geom_to_key(self):
        kt = gps.KeyTransform(self.layout)
        self.assertTrue(
            kt.geometry_to_keys(Point(0.1, 0.1)) == [gps.SpatialKey(0, 4)])
def windows(line, ws):
    for w in ws:
        ((row_start, row_stop), (col_start, col_stop)) = w

        area_of_interest = box(-122.47678756713866, 37.80924146650164,
                               -122.46288299560545, 37.80490143094975)

        left = bounds.left + (bounds.right - bounds.left) * (float(col_start) / width)
        right = bounds.left + (bounds.right - bounds.left) * (float(col_stop) / width)
        bottom = bounds.top + (bounds.bottom - bounds.top) * (float(row_stop) / height)
        top = bounds.top + (bounds.bottom - bounds.top) * (float(row_start) / height)
        extent = gps.Extent(left, bottom, right, top)

        instant = datetime.strptime(line['date'], '%Y%j')

        new_line = line.copy()
        new_line.pop('date')
        new_line.pop('scene_id')
        new_line['window'] = w
        new_line['projected_extent'] = gps.TemporalProjectedExtent(
            extent=extent, instant=instant, proj4=proj4, geometries=area_of_interest)

        yield new_line
def extent_for_cell(layout, cell):
    """Compute the geodetic extent of a specific tile in a layout.

    Args:
        layout (``gps.LayoutDefinition``)
        cell (``gps.SpatialKey`` or ``gps.SpaceTimeKey``)

    Returns:
        ``gps.Extent``
    """
    if isinstance(cell, (gps.SpatialKey, gps.SpaceTimeKey)):
        col = cell.col
        row = cell.row
    elif isinstance(cell, tuple):
        col = cell[0]
        row = cell[1]
    else:
        raise TypeError("extent_for_cell() expects SpatialKey, SpaceTimeKey, or tuple")

    w = (layout.extent.xmax - layout.extent.xmin) / layout.tileLayout.layoutCols
    h = (layout.extent.ymax - layout.extent.ymin) / layout.tileLayout.layoutRows
    x0 = layout.extent.xmin + col * w
    y0 = layout.extent.ymax - (row + 1) * h

    return gps.Extent(x0, y0, x0 + w, y0 + h)
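# A minimal usage sketch for extent_for_cell(), assuming geopyspark is importable
# as `gps`. With a 5x5 layout over (0, 0, 1, 1), the upper-left key (0, 0) maps
# to the top-left 0.2 x 0.2 cell.
import geopyspark as gps

layout = gps.LayoutDefinition(gps.Extent(0, 0, 1, 1), gps.TileLayout(5, 5, 2, 2))
print(extent_for_cell(layout, gps.SpatialKey(0, 0)))  # -> Extent(0.0, 0.8, 0.2, 1.0)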
def _read_windows(uri, xcols, ycols, bands, crs_to_proj4):
    if ("GDAL_DATA" not in os.environ) and (_GDAL_DATA is not None):
        os.environ["GDAL_DATA"] = _GDAL_DATA

    with rasterio.open(uri) as dataset:
        bounds = dataset.bounds
        height = dataset.height
        width = dataset.width
        proj4 = crs_to_proj4(dataset.get_crs())
        nodata = dataset.nodata
        tile_cols = int(math.ceil(width / xcols)) * xcols
        tile_rows = int(math.ceil(height / ycols)) * ycols
        # rasterio windows are ((row_start, row_stop), (col_start, col_stop))
        windows = [((y, min(height - 1, y + ycols)), (x, min(width - 1, x + xcols)))
                   for y in range(0, tile_rows, ycols)
                   for x in range(0, tile_cols, xcols)]

        for window in windows:
            ((row_start, row_stop), (col_start, col_stop)) = window

            left = bounds.left + (bounds.right - bounds.left) * (float(col_start) / width)
            right = bounds.left + (bounds.right - bounds.left) * (float(col_stop) / width)
            bottom = bounds.top + (bounds.bottom - bounds.top) * (float(row_stop) / height)
            top = bounds.top + (bounds.bottom - bounds.top) * (float(row_start) / height)
            extent = gps.Extent(left, bottom, right, top)
            projected_extent = gps.ProjectedExtent(extent=extent, proj4=proj4)

            data = dataset.read(bands, window=window)
            tile = gps.Tile.from_numpy_array(data, no_data_value=nodata)
            yield (projected_extent, tile)
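# Usage sketch for _read_windows() (the GeoTIFF path is hypothetical; assumes an
# active SparkContext `sc` and that the caller supplies a rasterio-CRS-to-proj4
# converter): each yielded (ProjectedExtent, Tile) pair can be parallelized into
# a RasterLayer.
pairs = _read_windows("scene.tif", xcols=512, ycols=512, bands=[1],
                      crs_to_proj4=lambda crs: crs.to_proj4())
layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPATIAL, sc.parallelize(list(pairs)))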
def get_raster_layer(sc, path):
    jp2s = ["B02.jp2", "B03.jp2", "B04.jp2"]
    arrs = []

    for jp2 in jp2s:
        with rasterio.open(path + jp2) as f:
            arrs.append(f.read(1))

    data = np.array(arrs, dtype=arrs[0].dtype)

    # Create an Extent instance from rasterio's bounds
    extent = gps.Extent(*f.bounds)

    # The EPSG code can also be obtained from the information read in via rasterio
    projected_extent = gps.ProjectedExtent(extent=extent,
                                           epsg=int(f.crs.to_dict()['init'][5:]))

    # We can create a Tile instance from our multiband raster array and the nodata value from rasterio
    tile = gps.Tile.from_numpy_array(numpy_array=data, no_data_value=f.nodata)

    # Now that we have our ProjectedExtent and Tile, we can create our RDD from them
    rdd = sc.parallelize([(projected_extent, tile)])

    # While there is a time component to the data, this was ignored for this tutorial and
    # instead the focus is just on the spatial information. Thus, we have a LayerType of SPATIAL.
    raster_layer = gps.RasterLayer.from_numpy_rdd(layer_type=gps.LayerType.SPATIAL,
                                                  numpy_rdd=rdd)

    return raster_layer
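# Usage sketch for get_raster_layer() (the Sentinel-2 band directory is
# hypothetical; assumes an active SparkContext `sc`): build the layer, then
# tile it to a global web-mercator layout for further processing.
raster_layer = get_raster_layer(sc, "/tmp/sentinel/S2A_example/")
tiled_layer = raster_layer.tile_to_layout(layout=gps.GlobalLayout(), target_crs=3857)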
def get_slice_indexes_and_extent(nc_file, geojson_shape):
    """
    Calculates x/y slice indexes in the nc file for the given shape.

    :param nc_file: NetCDF File
    :param geojson_shape: Requested shape
    :return: x/y-indexes of shape bounding box, geopyspark extent of bounding box,
             geojson features as polygons in x/y coordinates
    """
    lat_array = nc_file['lat'][:]
    lon_array = nc_file['lon'][:]

    # Transform the geojson into shapes. We need the shapes represented both as
    # indices into the lat-/lon-arrays (to read only the required slices from NetCDF)
    # and as x-/y-values (to mask the constructed layout).
    x_coords = nc_file['rlon'][:]
    y_coords = nc_file['rlat'][:]

    mask_shapes_indices = []
    mask_shapes_xy = []
    for feature in geojson_shape:
        # Get each vertex's index in the lat- and lon-arrays
        vertex_indices = np.array(
            list(get_indexes(lat_array, lon_array, lon_array.shape, vertex[1], vertex[0])
                 for vertex in feature['geometry']['coordinates'][0]))
        mask_shapes_indices.append(vertex_indices)

        # Get the corresponding x and y values
        vertex_xs = x_coords[np.array(vertex_indices)[:, 1]]
        vertex_ys = y_coords[np.array(vertex_indices)[:, 0]]

        # Transform into a polygon
        polygon = Polygon(zip(vertex_xs, vertex_ys))
        mask_shapes_xy.append(polygon)

    # Get the slices to read from NetCDF
    y_slice_start = int(min(s[:, 0].min() for s in mask_shapes_indices))
    x_slice_start = int(min(s[:, 1].min() for s in mask_shapes_indices))
    y_slice_stop = int(max(s[:, 0].max() for s in mask_shapes_indices))
    x_slice_stop = int(max(s[:, 1].max() for s in mask_shapes_indices))

    x_min = float(min(s.bounds[0] for s in mask_shapes_xy))
    y_min = float(min(s.bounds[1] for s in mask_shapes_xy))
    x_max = float(max(s.bounds[2] for s in mask_shapes_xy))
    y_max = float(max(s.bounds[3] for s in mask_shapes_xy))
    extent = gps.Extent(x_min, y_min, x_max, y_max)

    return x_slice_start, x_slice_stop, y_slice_start, y_slice_stop, extent, mask_shapes_xy
def map_ndvi(M, img, bounds, crs):
    # Start a spark context if needed
    init_sc()

    # Color ramp for NDVI
    ndvi_breaks_dict = {
        0.05: 0xffffe5aa,
        0.1: 0xf7fcb9ff,
        0.2: 0xd9f0a3ff,
        0.3: 0xaddd8eff,
        0.4: 0x78c679ff,
        0.5: 0x41ab5dff,
        0.6: 0x238443ff,
        0.7: 0x006837ff,
        1.0: 0x004529ff
    }
    ndvi_color_map = gps.ColorMap.from_break_map(ndvi_breaks_dict)

    # Convert the CRS into a proj4 string
    srs = osr.SpatialReference()
    srs.ImportFromWkt(crs.wkt)
    proj4 = srs.ExportToProj4()

    # Create the projected extent
    projected_extent = gps.ProjectedExtent(
        gps.Extent(bounds.left, bounds.bottom, bounds.right, bounds.top),
        proj4=proj4)

    tiles = sc.parallelize([(projected_extent,
                             gps.Tile.from_numpy_array(img, no_data_value=0.0))])
    raster_layer = gps.geotrellis.RasterLayer.from_numpy_rdd(gps.LayerType.SPATIAL, tiles)
    tiled_raster_layer = raster_layer.tile_to_layout(
        gps.GlobalLayout(),
        target_crs=3857,
        partition_strategy=gps.HashPartitionStrategy(40))
    pyramid = tiled_raster_layer.pyramid(resample_method=gps.ResampleMethod.BILINEAR)

    tms = gps.TMS.build(pyramid, ndvi_color_map)
    M.add_layer(TMSRasterData(tms), name="ndvi")
def key_to_extent(self, key, *args):
    """Returns the Extent corresponding to a given key.

    Args:
        key (:class:`~geopyspark.geotrellis.SpatialKey` or
             :class:`~geopyspark.geotrellis.SpaceTimeKey` or int): The key to find
            the extent for. If of type int, then this parameter is the column of
            the key, and the call must provide a single additional int value in
            the args parameter to serve as the row of the key.

    Returns:
        :class:`~geopyspark.geotrellis.Extent`
    """
    if isinstance(key, (gps.SpatialKey, gps.SpaceTimeKey)):
        skey = self.__jvm.geotrellis.spark.SpatialKey(key.col, key.row)
    elif isinstance(key, tuple):
        skey = self.__jvm.geotrellis.spark.SpatialKey(key[0], key[1])
    elif isinstance(key, int) and len(args) == 1 and isinstance(args[0], int):
        skey = self.__jvm.geotrellis.spark.SpatialKey(key, args[0])
    else:
        raise ValueError("Please supply either gps.SpatialKey, gps.SpaceTimeKey, "
                         "(int, int), or two ints")

    ex = self.__layout.mapTransform().apply(skey)
    return gps.Extent(ex.xmin(), ex.ymin(), ex.xmax(), ex.ymax())
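# Usage sketch for key_to_extent() (assumes geopyspark is importable as `gps`
# and a SparkContext is running, since KeyTransform delegates to the JVM): the
# same key can be given as a SpatialKey, a (col, row) tuple, or two ints.
kt = gps.KeyTransform(gps.LayoutDefinition(gps.Extent(0, 0, 1, 1),
                                           gps.TileLayout(5, 5, 2, 2)))
assert kt.key_to_extent(gps.SpatialKey(0, 0)) == kt.key_to_extent((0, 0)) == kt.key_to_extent(0, 0)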
def buffered_cell_extent(layout, px_buffer, cell):
    """Compute the extent of a cell in a layout with a buffer.

    This function computes the extent of a cell and adds ``px_buffer`` worth of
    area on all sides. That is, if the tile dimension in a given layout is
    n x n pixels, then this function returns the extent for a
    ``(n + 2 * px_buffer)`` square region centered on the given cell.

    Args:
        layout (``gps.LayoutDefinition``)
        px_buffer (int): number of pixels to pad the border of the extent with
        cell (``gps.SpatialKey`` or ``gps.SpaceTimeKey``): identifier of the
            desired layout cell

    Returns:
        ``gps.Extent``
    """
    ex = extent_for_cell(layout, cell)
    cx, cy = cell_size(layout)
    return gps.Extent(ex.xmin - cx * px_buffer,
                      ex.ymin - cy * px_buffer,
                      ex.xmax + cx * px_buffer,
                      ex.ymax + cy * px_buffer)
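# Worked sketch of the buffer arithmetic (assumes cell_size(layout) returns the
# per-pixel (width, height) of the layout): with 200x200-pixel tiles covering
# one degree each, a pixel is 0.005 degrees, so px_buffer=2 pads every side of
# the cell extent by 0.01 degrees.
layout = gps.LayoutDefinition(gps.Extent(0, 0, 10, 10),
                              gps.TileLayout(10, 10, 200, 200))
print(buffered_cell_extent(layout, 2, gps.SpatialKey(0, 0)))
# -> approximately Extent(-0.01, 8.99, 1.01, 10.01)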
def convert_extent(extent_sc: JavaObject) -> geopyspark.Extent:
    return geopyspark.Extent(extent_sc.xmin(), extent_sc.ymin(),
                             extent_sc.xmax(), extent_sc.ymax())
def execute(spark, logger, s3_bucket, run_id, aoi_name, complete_catalog,
            probability_images, seed, config_filename):
    """The primary script

    Args:
        spark (``pyspark.sql.SparkSession``)
        logger (``py4j.JavaObject``)
        s3_bucket (str): Name of the S3 bucket to search for configuration objects
            and save results to
        run_id (str): The identifier of the current run
        aoi_name (str): The identifier for the current area of interest
        complete_catalog (bool): If true, write probability images for the complete
            image catalog rather than a sample of test cells
        probability_images (int): The number of tiles to save the generated
            probability images for
        seed (int): A random seed used to sample the probability images, for
            reproducibility
        config_filename (str): Name of the YAML configuration object in s3_bucket

    Required external inputs:
        <s3_bucket>/cvmapper_config.yaml, under the ``learner`` key:
            prefix: The S3 prefix under which CSVs can be read and written
            pool: Name of CSV file under s3_bucket/prefix giving the comprehensive
                list of active grid cells
            incoming_names: Name of CSV file under s3_bucket/prefix giving the list
                of cells used for training/validation
            image_catalog: Name of CSV file under s3_bucket giving the catalog of
                imagery
            image_output_pattern: URI pattern used for output of probability images.
                Must contain two '{}' tokens to be replaced by the column and row
                for the relevant cell
            outgoing: S3 URI to save the CSV of worst-performing cells to

        location pool: A CSV of ``name``, ``col``, ``row`` for each grid cell under
            consideration. Identified by the ``pool`` parameter above.

        incoming names: CSV containing (at least) ``name``, ``iteration``, and
            ``usage`` columns. Every name in this file must also be contained in
            the image pool. Location of this file is given in the YAML file.

        image catalog: A CSV minimally containing ``col``, ``row``, ``season``, and
            ``uri`` columns. Season is either 'GS' or 'OS'. Every grid cell in the
            location pool must be contained here, and must have an entry for both
            seasons. URI points to a TIFF that completely covers the listed cell
            with valid image data (no NODATA values).

    Note:
        Grid cells are defined according to the master_layout object, which
        specifies a rectangular extent in long/lat coords. This extent is
        subdivided into cells (in this case, 13792 columns and 14477 rows). Each
        cell is then given a pixel resolution (in this case 200x200, but whatever
        is chosen must match the resolution of the label images provided in the
        ``s3://<s3_bucket>/<prefix>/<name>_<col>_<row>.tif`` files identified by
        the incoming names CSV). When we refer to tiles, we mean image chips of
        the stated resolution, indexed by ``gps.SpatialKey`` objects. The key is
        a col/row pair where row=0, col=0 corresponds to the chip in the upper
        left corner of the bounding extent.

    Note:
        Grid cell names for the output probability images (`image_output_pattern`)
        are relative to a different, coarser layout. These grid cell ids need not
        be clearly defined, since the output of this process is simply a bucket of
        COGs for display using another tool. However, see the `coarse_layout`
        definition below for specific details of the layout.
""" params = parse_yaml_from_s3(s3_bucket, config_filename)['learner'] label_path = parse_yaml_from_s3( s3_bucket, config_filename)['labeller']['consensus_directory'][1:-1] s3_prefix = params['prefix'] s3_prefix = s3_prefix[0:-1] if s3_prefix.endswith('/') else s3_prefix catalog_prefix = params['image_catalog'] catalog_prefix_fix = params['image_catalog_fix'] feature_names = functools.reduce(lambda a, b: a + b, [[ "{}_raw_{}".format(season, n), "{}_avg_{}".format(season, n), "{}_std_{}".format(season, n) ] for season in ["GS", "OS"] for n in range(1, 5)]) master_layout = gps.LayoutDefinition( gps.Extent(-17.541, -35.46, 51.459, 37.54), gps.TileLayout(13800, 14600, 200, 200)) master_metadata = gps.Metadata( gps.Bounds(gps.SpatialKey(0, 0), gps.SpatialKey(13800, 14600)), "+proj=longlat +datum=WGS84 +no_defs ", gps.CellType.INT8, master_layout.extent, master_layout) #################################### logger.warn("Reading source tables") checkpoint = time.time() f_pool = spark\ .read\ .option('inferScheme', True)\ .option('header', True)\ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['pool']))\ .repartition('col', 'row') qs_in = spark \ .read \ .option('inferScheme', True) \ .option('header', True) \ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['qs'])) \ .repartition('col', 'row') incoming = spark.read\ .option('header', True)\ .schema(StructType([ StructField('name', StringType()), StructField('run', IntegerType()), StructField('iteration', IntegerType()), StructField('processed', BooleanType()), StructField('usage', StringType()), StructField('label', StringType()) ]))\ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names'])) # merge incoming_names and incoming_names_static incoming = incoming.union(spark.read \ .option('header', True) \ .schema(StructType([ StructField('name', StringType()), StructField('run', IntegerType()), StructField('iteration', IntegerType()), StructField('processed', BooleanType()), StructField('usage', StringType()), StructField('label', StringType()) ])) \ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names_static']))) incoming = incoming.filter(incoming['run'] == params['runid']).filter( incoming['label'] == True) test_names = f_pool.join(incoming.select('name'), 'name', 'left_anti').withColumn("usage", lit("test")) all_names = f_pool.join(incoming.select('name', 'usage'), f_pool.name == incoming.name, how='left')\ .select(f_pool.name.alias('name'), 'col', 'row', 'usage') num_test_images = test_names.count() image_catalog = spark.read\ .option('inferScheme', True)\ .option('header', True)\ .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix))\ .repartition('col', 'row') all_image_uris = image_catalog\ .filter(image_catalog['season'] == 'GS')\ .alias('gs')\ .join(image_catalog.filter(image_catalog['season'] == 'OS').alias('os'), (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\ .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS')) logger.warn( "Elapsed time for reading source tables: {}s".format(time.time() - checkpoint)) #################################### logger.warn("Reading training labels & building training features") checkpoint = time.time() training_data = gather_data(all_image_uris, all_names.filter(all_names.usage == 'train'), master_metadata, feature_names, s3_bucket, label_path, include_masks=True) training_data.show() logger.warn( "Elapsed time for reading training labels and feature building: {}s". 
        format(time.time() - checkpoint))

    ####################################
    logger.warn("Balancing data")
    checkpoint = time.time()

    balanced_data = balance_samples(spark, training_data, 'mask')
    balanced_data.show()

    logger.warn("Elapsed time for balancing data: {}s".format(time.time() - checkpoint))

    ####################################
    logger.warn("Training model")
    checkpoint = time.time()

    pipeline = ml_pipeline(feature_names, 'mask')
    model = pipeline.fit(balanced_data)
    print(model)

    logger.warn("Elapsed time for training the model: {}s".format(time.time() - checkpoint))

    ####################################
    logger.warn("Validating model results")
    checkpoint = time.time()

    validation_data = gather_data(all_image_uris,
                                  all_names.filter(all_names.usage == 'validate'),
                                  master_metadata,
                                  feature_names,
                                  s3_bucket,
                                  label_path,
                                  include_masks=True)
    valid_fit = model.transform(validation_data).select('prediction', 'probability', 'mask')

    metrics = MulticlassMetrics(valid_fit.rdd.map(lambda r: (r.prediction, r.mask)))
    confusion_matrix = metrics.confusionMatrix().toArray().flatten().tolist()  # left to right, top to bottom
    tss = 1.0 * confusion_matrix[3] / (confusion_matrix[3] + confusion_matrix[2]) + \
          1.0 * confusion_matrix[0] / (confusion_matrix[0] + confusion_matrix[1]) - 1
    binmetrics = BinaryClassificationMetrics(
        valid_fit.rdd.map(lambda r: (float(r['probability'][1]), r['mask'])))

    last_iteration = incoming.agg(F.max('iteration')).collect()[0][0]

    report = pd.DataFrame({
        'run': [run_id],
        'iteration': [last_iteration + 1],
        'tss': [tss],
        'accuracy': [metrics.accuracy],
        'precision': [metrics.precision(1.0)],
        'recall': [metrics.recall(1.0)],
        'fpr': [metrics.falsePositiveRate(1.0)],
        'tpr': [metrics.truePositiveRate(1.0)],
        'AUC': [binmetrics.areaUnderROC],
        'aoi': [aoi_name],
        'iteration_time': [datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')]
    })

    # TODO: allow target location to be derived from params (local or s3)
    # added because of an error where incoming_metrics.csv contained different iteration number (10)
    # than expected by DB (4). Ryan's guess is that this is due to multiple test clusters overwriting csv
    # print("############Old Iteration Metrics to overwrite###########")
    # incoming_previous = pd.read_csv(os.path.join("s3://", s3_bucket, s3_prefix, params['metrics']))
    # print(incoming_previous.to_string())
    # print("############New Iteration Metrics to use to overwrite###########")
    # print(report.to_string())
    pd_df_to_s3_csv(report, s3_bucket, os.path.join(s3_prefix, params['metrics']))

    logger.warn("Elapsed time for validating and saving metrics to s3: {}s".format(
        time.time() - checkpoint))

    ####################################
    logger.warn("Classifying test data")
    checkpoint = time.time()

    filtered_names = test_names.filter(test_names.usage == "test")
    # filtered_names.cache()
    # filtered_names.show()
    test_features = gather_data(all_image_uris, filtered_names, master_metadata,
                                feature_names, s3_bucket)
    test_features_sample = test_features.sample(True, 0.1)

    fitted = model.transform(test_features_sample).select(
        'spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
    # fitted.cache()
    # fitted.show()
    grouped = fitted.groupBy('spatial_key')

    # don't want to use following UDF, but indication is that there is a bug in pyspark
    # preventing vector accesses:
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability
    # (This did not work without the UDF!)
    firstelement = F.udf(lambda v: float(v[0]), FloatType())
    # added this UDF to select the probability of field rather than no field to write to probability images
    secondelement = F.udf(lambda v: float(v[1]), FloatType())

    logger.warn("Elapsed time for classifying test grids: {}s".format(time.time() - checkpoint))

    ####################################
    if probability_images > 0 or complete_catalog:
        logger.warn("Write catalog of {} probability images".format(probability_images))
        checkpoint = time.time()

        if complete_catalog:
            # new catalog
            image_catalog_fix = spark.read\
                .option('inferScheme', True)\
                .option('header', True)\
                .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix_fix))\
                .repartition('col', 'row')

            all_image_uris_fix = image_catalog_fix\
                .filter(image_catalog_fix['season'] == 'GS')\
                .alias('gs')\
                .join(image_catalog_fix.filter(image_catalog_fix['season'] == 'OS').alias('os'),
                      (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\
                .select(col('gs.col'), col('gs.row'),
                        col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

            # recollect all pixels for all testing images
            compreh_names = f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer')
            features_compreh = gather_data(all_image_uris_fix, compreh_names,
                                           master_metadata, feature_names, s3_bucket)
            fitted_compreh = model.transform(features_compreh)\
                .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_compreh = fitted_compreh.groupBy('spatial_key')

            # added to test sampling
            assembled = grouped_compreh.agg(
                assembleTile('column_index',
                             'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())
        else:
            ####################################
            logger.warn("Identify worst performing cells")
            checkpoint = time.time()

            # TODO: Determine which images to take
            certainty = grouped\
                .agg(F.avg(F.pow(firstelement(fitted.probability) - lit(0.5), 2.0))
                     .alias('certainty')).cache()
            certainty.show()
            worst_keys_rdd = certainty\
                .sort('certainty')\
                .select('spatial_key')\
                .limit(round(certainty.count() * 0.05))\
                .rdd.takeSample(False, (params['number_outgoing_names']))
            worst_keys = spark.createDataFrame(worst_keys_rdd)
            outgoing_names = worst_keys\
                .join(f_pool,
                      (col('spatial_key.col') == col('col')) &
                      (col('spatial_key.row') == col('row')))\
                .select('name')\
                .withColumn('run', lit(run_id))\
                .withColumn('iteration', lit(last_iteration + 1))\
                .withColumn('processed', lit(False))\
                .withColumn('usage', lit('train'))\
                .toPandas()
            uri = urlparse.urlparse(params['outgoing'])
            pd_df_to_s3_csv(outgoing_names, uri.netloc, uri.path[1:])

            logger.warn(
                "Elapsed time for sorting certainty, converting to Pandas Dataframe, and saving to s3: {}s"
                .format(time.time() - checkpoint))

            ###########################################
            checkpoint = time.time()

            # sampling testing images (num = probability_images)
            filtered_names_sample = filtered_names\
                .sample(False, min(1.0, float(probability_images) / float(num_test_images)), seed=seed)\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), ['col', 'row'])\
                .select('scene_id')\
                .dropDuplicates()\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), 'scene_id')\
                .join(f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer'), ['col', 'row'])\
                .select('name', 'col', 'row', 'name_col_row')

            # re-collect all pixels within sampled images
            features_images = gather_data(all_image_uris,
                                          filtered_names_sample,
                                          master_metadata,
                                          feature_names,
                                          s3_bucket)

            # reclassify sampled testing images
            fitted_images = model.transform(features_images)\
                .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_sample = fitted_images.join(
                filtered_names_sample,
                (col('spatial_key.col') == col('col')) &
                (col('spatial_key.row') == col('row'))).groupby('spatial_key')
            assembled = grouped_sample.agg(
                assembleTile('column_index',
                             'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        coarse_layout = gps.LayoutDefinition(
            gps.Extent(-17.541, -35.46, 51.459, 37.54),
            gps.TileLayout(1380, 1460, 2000, 2000))

        # we multiply by 100 to select digits that will be kept after converting from float to int.
        # range of int8 is to 128, so we can only preserve 2 sig figs
        output_tiles = (layer * 100).convert_data_type(gps.CellType.INT8)\
            .tile_to_layout(coarse_layout)\
            .to_geotiff_rdd(storage_method=gps.StorageMethod.TILED)

        cog_location = '/tmp/image_{}_{}.tif' if 'image_output_pattern' not in params \
            else params['image_output_pattern']
        output_tiles.foreach(lambda pair: write_bytes_to_s3(
            cog_location.format(pair[0].col, pair[0].row, aoi_name, run_id,
                                str(last_iteration + 1)),
            pair[1]))

        logger.warn("Elapsed time for writing catalog of probability images: {}s".format(
            time.time() - checkpoint))
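# Illustrative sketch only (not part of the pipeline above), tying the docstring
# Note to the extent helpers earlier in this collection: a master_layout grid
# cell (col, row) can be mapped to its long/lat extent with extent_for_cell().
master_layout = gps.LayoutDefinition(
    gps.Extent(-17.541, -35.46, 51.459, 37.54),
    gps.TileLayout(13800, 14600, 200, 200))
print(extent_for_cell(master_layout, gps.SpatialKey(0, 0)))  # upper-left 0.005-degree cell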
def load_test_collection(
    collection_id: str,
    collection_metadata: GeopysparkCubeMetadata,
    extent,
    srs: str,
    from_date: str,
    to_date: str,
    bands=None,
    correlation_id: str = "NA",
) -> Dict[int, geopyspark.TiledRasterLayer]:
    """
    Load synthetic data as test collection

    :param collection_id:
    :param collection_metadata:
    :param extent:
    :param srs:
    :param from_date:
    :param to_date:
    :param bands:
    :param correlation_id:
    :return:
    """
    # TODO: support more test collections
    assert collection_id == "TestCollection-LonLat4x4"
    grid_size: float = 1.0
    tile_size = 4

    # TODO: support other srs'es?
    assert srs == "EPSG:4326"

    # Get bounds of tiling layout
    extent = geopyspark.Extent(extent.xmin(), extent.ymin(), extent.xmax(), extent.ymax())
    col_min = int(math.floor(extent.xmin / grid_size))
    row_min = int(math.floor(extent.ymin / grid_size))
    col_max = int(math.ceil(extent.xmax / grid_size) - 1)
    row_max = int(math.ceil(extent.ymax / grid_size) - 1)

    # Simulate sparse range of observation dates
    from_date = rfc3339.parse_datetime(rfc3339.datetime(from_date))
    to_date = rfc3339.parse_datetime(rfc3339.datetime(to_date))
    dates = dates_between(from_date, to_date)

    # Build RDD of tiles with requested bands.
    tile_builder = TestCollectionLonLat(tile_size=tile_size, grid_size=grid_size)
    bands = bands or [b.name for b in collection_metadata.bands]
    rdd_data = [
        (SpaceTimeKey(col, row, date),
         tile_builder.get_tile(bands=bands, col=col, row=row, date=date))
        for col in range(col_min, col_max + 1)
        for row in range(row_min, row_max + 1)
        for date in dates
    ]
    rdd = SparkContext.getOrCreate().parallelize(rdd_data)

    metadata = Metadata(
        bounds=Bounds(SpaceTimeKey(col_min, row_min, min(dates)),
                      SpaceTimeKey(col_max, row_max, max(dates))),
        crs="+proj=longlat +datum=WGS84 +no_defs ",
        cell_type=CellType.FLOAT64,
        extent=extent,
        layout_definition=LayoutDefinition(
            extent=geopyspark.Extent(col_min * grid_size, row_min * grid_size,
                                     (col_max + 1) * grid_size, (row_max + 1) * grid_size),
            tileLayout=TileLayout(layoutCols=col_max - col_min + 1,
                                  layoutRows=row_max - row_min + 1,
                                  tileCols=tile_size,
                                  tileRows=tile_size)))
    layer = TiledRasterLayer.from_numpy_rdd(LayerType.SPACETIME, rdd, metadata)
    return {0: layer}
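# Usage sketch for load_test_collection() (the band names and the metadata object
# are assumptions; the extent mimics the JVM-style object the openEO driver passes
# in, i.e. something exposing xmin()/ymin()/xmax()/ymax() methods).
class _FakeJvmExtent:
    def __init__(self, xmin, ymin, xmax, ymax):
        self._values = (xmin, ymin, xmax, ymax)

    def xmin(self): return self._values[0]
    def ymin(self): return self._values[1]
    def xmax(self): return self._values[2]
    def ymax(self): return self._values[3]

layers = load_test_collection(
    collection_id="TestCollection-LonLat4x4",
    collection_metadata=collection_metadata,  # assumed GeopysparkCubeMetadata instance
    extent=_FakeJvmExtent(0.0, 0.0, 2.0, 2.0),
    srs="EPSG:4326",
    from_date="2021-01-01",
    to_date="2021-01-31",
    bands=["Longitude", "Latitude"],  # assumed band names of the test collection
)
tiled_layer = layers[0]  # geopyspark.TiledRasterLayer keyed by SpaceTimeKey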