def load_crophealth_data():
    """
    Loads Sentinel-2 analysis-ready data (ARD) product for the crop health
    case-study area. The ARD product is provided for the last year.
    Last modified: January 2020

    outputs
    ds - data set containing combined, masked data from Sentinel-2a and -2b.
         Masked values are set to 'nan'
    """

    # Suppress warnings
    warnings.filterwarnings('ignore')

    # Initialise the data cube. 'app' argument is used to identify this app
    dc = datacube.Datacube(app='Crophealth-app')

    # Specify latitude and longitude ranges
    latitude = (-24.974997, -24.995971)
    longitude = (152.429994, 152.395805)

    # Specify the date range
    # Calculated as today's date minus 365 days, giving one year of data
    # to match the docstring above
    # Dates are converted to strings as required by loading function below
    end_date = dt.date.today()
    start_date = end_date - dt.timedelta(days=365)

    time = (start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))

    # Construct the data cube query
    products = ["s2a_ard_granule", "s2b_ard_granule"]

    query = {
        'x': longitude,
        'y': latitude,
        'time': time,
        'measurements': [
            'nbar_red',
            'nbar_green',
            'nbar_blue',
            'nbar_nir_1',
            'nbar_swir_2',
            'nbar_swir_3'
        ],
        'output_crs': 'EPSG:3577',
        'resolution': (-10, 10)
    }

    # Load the data and mask out bad quality pixels
    ds_s2 = load_ard(dc, products=products, min_gooddata=0.5, **query)

    # Calculate the normalised difference vegetation index (NDVI) across
    # all pixels for each image.
    # This is stored as an attribute of the data
    ds_s2 = calculate_indices(ds_s2, index='NDVI', collection='ga_s2_1')

    # Return the data
    return ds_s2
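# --------------------------------------------------------------------------
# Minimal usage sketch for load_crophealth_data(). The import paths below
# are assumptions based on the DEA notebook 'Scripts' convention of the
# time (dea_datahandling / dea_bandindices); adjust them to wherever
# load_ard and calculate_indices live in your environment.
# --------------------------------------------------------------------------
import warnings
import datetime as dt

import datacube
from dea_datahandling import load_ard            # assumed module path
from dea_bandindices import calculate_indices    # assumed module path

# Load the masked Sentinel-2 ARD for the case-study area
ds = load_crophealth_data()

# Illustrative check: plot NDVI for the most recent observation
ds.NDVI.isel(time=-1).plot(cmap='RdYlGn', vmin=0, vmax=1)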
"output_crs": "EPSG:32755", "resolution": (-10, 10) } prefire_data = load_ard( dc=dc, products=['s2a_ard_granule', 's2b_ard_granule'], measurements=['nbart_nir_1', 'nbart_swir_3'], min_gooddata=0, # dask_chunks={'x': 'auto', 'y': 'auto'}, group_by='solar_day', **query_1) prefire_image = prefire_data.median(dim='time') prefire_image = calculate_indices(prefire_image, index='NBR', collection='ga_s2_1', drop=False) prefire_burnratio = prefire_image.NBR prefire_burnratio.data query_2 = { "x": (central_lon - buffer, central_lon + buffer), "y": (central_lat - buffer, central_lat + buffer), "time": (postfire_start, postfire_end), "output_crs": "EPSG:32755", "resolution": (-10, 10) } postfire_data = load_ard( dc=dc, products=['s2a_ard_granule', 's2b_ard_granule'],
def get_training_data_for_shp(gdf,
                              index,
                              row,
                              out_arrs,
                              out_vars,
                              products,
                              dc_query,
                              custom_func=None,
                              field=None,
                              calc_indices=None,
                              reduce_func=None,
                              drop=True,
                              zonal_stats=None):
    """
    Function to extract data from the ODC for training a machine learning
    classifier using a geopandas geodataframe of labelled geometries.
    This function provides a number of pre-defined methods for producing
    training data, including: calculating band indices, reducing time series
    using summary statistics, and/or generating zonal statistics across
    polygons. The 'custom_func' parameter provides a method for the user
    to supply a custom function for generating features rather than using
    the pre-defined methods.

    Parameters
    ----------
    gdf : geopandas geodataframe
        geometry data in the form of a geopandas geodataframe
    index : integer
        the row index of the polygon in 'gdf' currently being extracted
    row : pandas Series
        the geodataframe row corresponding to 'index'; the class label is
        read from row[field]
    products : list
        a list of products to load from the datacube.
        e.g. ['ga_ls7e_ard_3', 'ga_ls8c_ard_3']
    dc_query : dictionary
        Datacube query object, should not contain lat and long (x or y)
        variables as these are supplied by the 'gdf' variable
    field : string
        A string containing the name of the column with class labels.
        Field must contain numeric values.
    out_arrs : list
        An empty list into which the training data arrays are stored.
    out_vars : list
        An empty list into which the data variable names are stored.
    custom_func : function, optional
        A custom function for generating feature layers. If this parameter
        is set, all other options (excluding 'zonal_stats') will be ignored.
        The result of the 'custom_func' must be a single xarray dataset
        containing 2D coordinates (i.e. x, y - no time dimension). The
        custom function has access to the datacube dataset extracted using
        the 'dc_query' params. Example custom function to return multiple
        products:
            `def custom_function(ds):
                 dc = datacube.Datacube(app='custom_function')
                 mad = dc.load(product='ls8_nbart_tmad_annual',
                               like=ds.geobox)
                 output = xr.merge([ds, mad])
                 return output`
    calc_indices : list, optional
        If not using a custom func, then this parameter provides a method
        for calculating a number of remote sensing indices
        (e.g. `['NDWI', 'NDVI']`).
    reduce_func : string, optional
        Function to reduce the data from multiple time steps to a single
        timestep. Options are 'mean', 'median', 'std', 'max', 'min',
        'geomedian'. Ignored if 'custom_func' is provided.
    drop : boolean, optional
        If this variable is set to True, and 'calc_indices' are supplied,
        the spectral bands will be dropped from the dataset leaving only
        the band indices as data variables in the dataset. Default is True.
    zonal_stats : string, optional
        An optional string giving the names of zonal statistics to calculate
        for each polygon. Default is None (all pixel values are returned).
        Supported values are 'mean', 'median', 'max', 'min', and 'std'.
        Will work in conjunction with a 'custom_func'.

    Returns
    --------
    Two lists are populated in place: 'out_arrs' receives a numpy.array of
    class labels and extracted data for each pixel or polygon, and
    'out_vars' receives the corresponding data variable names.
""" # prevent function altering dictionary kwargs dc_query = deepcopy(dc_query) # remove dask chunks if supplied as using # mulitprocessing for parallelization if 'dask_chunks' in dc_query.keys(): dc_query.pop('dask_chunks', None) # connect to datacube dc = datacube.Datacube(app='training_data') # set up query based on polygon (convert to albers) geom = geometry.Geometry(gdf.geometry.values[index].__geo_interface__, geometry.CRS('epsg:3577')) q = {"geopolygon": geom} # merge polygon query with user supplied query params dc_query.update(q) # load_ard doesn't handle derivative products, so check # products aren't one of those below others = [ 'ls5_nbart_geomedian_annual', 'ls7_nbart_geomedian_annual', 'ls8_nbart_geomedian_annual', 'ls5_nbart_tmad_annual', 'ls7_nbart_tmad_annual', 'ls8_nbart_tmad_annual', 'landsat_barest_earth', 'ls8_barest_earth_albers' ] if products[0] in others: ds = dc.load(product=products[0], **dc_query) ds = ds.where(ds != 0, np.nan) else: # load data with HiddenPrints(): ds = load_ard(dc=dc, products=products, output_crs='EPSG:3577', **dc_query) # create polygon mask with HiddenPrints(): mask = xr_rasterize(gdf.iloc[[index]], ds) # Use custom function for training data if it exists if custom_func is not None: with HiddenPrints(): data = custom_func(ds) # Mask dataset data = data.where(mask) else: # Mask dataset ds = ds.where(mask) # first check enough variables are set to run functions if (len(ds.time.values) > 1) and (reduce_func == None): raise ValueError( "You're dataset has " + str(len(ds.time.values)) + " time-steps, please provide a reduction function," + " e.g. reduce_func='mean'") if calc_indices is not None: # determine which collection is being loaded if products[0] in others: collection = 'ga_ls_2' elif '3' in products[0]: collection = 'ga_ls_3' elif 's2' in products[0]: collection = 'ga_s2_1' if len(ds.time.values) > 1: if reduce_func in ['mean', 'median', 'std', 'max', 'min']: with HiddenPrints(): data = calculate_indices(ds, index=calc_indices, drop=drop, collection=collection) # getattr is equivalent to calling data.reduce_func method_to_call = getattr(data, reduce_func) data = method_to_call(dim='time') elif reduce_func == 'geomedian': data = GeoMedian().compute(ds) with HiddenPrints(): data = calculate_indices(data, index=calc_indices, drop=drop, collection=collection) else: raise Exception( reduce_func + " is not one of the supported" + " reduce functions ('mean','median','std','max','min', 'geomedian')" ) else: with HiddenPrints(): data = calculate_indices(ds, index=calc_indices, drop=drop, collection=collection) # when band indices are not required, reduce the # dataset to a 2d array through reduce function if calc_indices is None: if len(ds.time.values) > 1: if reduce_func == 'geomedian': data = GeoMedian().compute(ds) elif reduce_func in ['mean', 'median', 'std', 'max', 'min']: method_to_call = getattr(ds, reduce_func) data = method_to_call('time') else: data = ds.squeeze() if zonal_stats is None: # If no zonal stats were requested then extract all pixel values flat_train = sklearn_flatten(data) # Make a labelled array of identical size flat_val = np.repeat(row[field], flat_train.shape[0]) stacked = np.hstack((np.expand_dims(flat_val, axis=1), flat_train)) elif zonal_stats in ['mean', 'median', 'std', 'max', 'min']: method_to_call = getattr(data, zonal_stats) flat_train = method_to_call() flat_train = flat_train.to_array() stacked = np.hstack((row[field], flat_train)) else: raise Exception( zonal_stats + " is not one of the supported" + " reduce 
functions ('mean','median','std','max','min')") # Append training data and labels to list out_arrs.append(stacked) out_vars.append([field] + list(data.data_vars))
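# --------------------------------------------------------------------------
# Hypothetical usage sketch: loop over a labelled geodataframe and collect
# training data with get_training_data_for_shp(). The shapefile name, query
# parameters, product and 'classnum' field below are placeholders, not
# values taken from the source.
# --------------------------------------------------------------------------
import numpy as np
import geopandas as gpd

gdf = gpd.read_file('training_polygons.shp')  # hypothetical labelled polygons

query = {
    'time': ('2018-01-01', '2018-12-31'),
    'measurements': ['nbart_red', 'nbart_green', 'nbart_blue', 'nbart_nir'],
    'resolution': (-25, 25)
}

out_arrs, out_vars = [], []

for index, row in gdf.iterrows():
    get_training_data_for_shp(gdf, index, row,
                              out_arrs=out_arrs,
                              out_vars=out_vars,
                              products=['ga_ls8c_ard_3'],
                              dc_query=query,
                              field='classnum',
                              calc_indices=['NDVI'],
                              reduce_func='median',
                              zonal_stats='mean')

# Each entry in out_arrs is one labelled feature vector; stack them into a
# single array ready for a scikit-learn classifier
model_input = np.vstack(out_arrs)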
def get_training_data_for_shp(path,
                              out,
                              product,
                              time,
                              crs='EPSG:3577',
                              field='classnum',
                              calc_indices=None,
                              feature_stats=None,
                              collection='ga_ls_2'):
    """
    Function to extract data for training a classifier using a shapefile
    of labelled polygons. Currently works for single time steps.

    Parameters
    ----------
    path : string
        Path to shapefile containing labelled polygons.
    out : list
        Empty list to contain output data.
    product : string
        String of product name from which to load and extract datacube data
        e.g. 'ls8_nbart_tmad_annual'
    time : tuple
        A tuple containing the time period from which to extract training
        data e.g. ('2015-01-01', '2015-12-31').
    crs : string
        A string containing the desired crs e.g. 'EPSG:3577'
    field : string
        A string containing the name of the column with labels in the
        shapefile attribute table. Field must contain numeric values.
    calc_indices : list, optional
        An optional list giving the names of any remote sensing indices
        to be calculated on the loaded data (e.g. `['NDWI', 'NDVI']`).
        This step will be skipped if any of the indices cannot be computed
        on the input product.
    feature_stats : string, optional
        An optional string giving the names of statistics to calculate
        for the polygon. Default is None (all pixel values). Supported
        values are 'mean' or 'geomedian' (from the `hdstats` module).
    collection : string, optional
        The collection name passed to calculate_indices.
        Default is 'ga_ls_2'.

    Returns
    --------
    A list of numpy.arrays containing classes and extracted data for
    each pixel or polygon.
    """

    # Import hdstats as it is only needed for this function
    if feature_stats == 'geomedian':
        try:
            import hdstats
        except ImportError as err:
            raise ImportError(
                'Can not import hdstats module needed to calculate'
                ' geomedian.\n{}'.format(err))

    dc = datacube.Datacube(app='training_data')

    query = {'time': time}
    query['crs'] = crs

    shp = gp.read_file(path)
    bounds = shp.total_bounds
    minx = bounds[0]
    maxx = bounds[2]
    miny = bounds[1]
    maxy = bounds[3]
    query['x'] = (minx, maxx)
    query['y'] = (miny, maxy)

    print("Loading data...")
    data = dc.load(product=product, group_by='solar_day', **query)

    # Check if indices are wanted and can be calculated on the product
    if calc_indices is not None:
        try:
            print("Calculating indices...")
            # Calculate indices - will use for all features
            for index in calc_indices:
                data = dea_bandindices.calculate_indices(
                    data, index, collection=collection)
        except ValueError:
            print("Input dataset not suitable for selected indices, "
                  "just extracting product data")
            pass

    # Remove time step if present
    try:
        data = data.isel(time=0)
    # Don't worry if it isn't
    except ValueError:
        pass

    print("Rasterizing features and extracting data...")

    # Initialize counter for status messages.
    i = 0

    # Go through each feature
    for poly_geom, poly_class_id in zip(shp.geometry, shp[field]):
        print("    Feature {:04}/{:04}\r".format(i + 1, len(shp.geometry)),
              end='')

        # Rasterise the feature
        mask = rasterize([(poly_geom, poly_class_id)],
                         out_shape=(data.y.size, data.x.size),
                         transform=data.affine)

        # Convert mask from numpy to DataArray
        mask = xr.DataArray(mask, coords=(data.y, data.x))

        # Mask out areas that were not within the labelled feature
        data_masked = data.where(mask == poly_class_id, np.nan)

        if feature_stats is None:
            # If no summary stats were requested then
            # extract all pixel values
            flat_train = sklearn_flatten(data_masked)
            # Make a labelled array of identical size
            flat_val = np.repeat(poly_class_id, flat_train.shape[0])
            stacked = np.hstack((np.expand_dims(flat_val, axis=1),
                                 flat_train))

        elif feature_stats == 'mean':
            # For the mean of each polygon take the mean over all
            # axes, ignoring masked out values (nan).
            # This gives a single pixel value for each band
            flat_train = data_masked.mean(axis=None, skipna=True)
            flat_train = flat_train.to_array()
            stacked = np.hstack((poly_class_id, flat_train))

        elif feature_stats == 'geomedian':
            # For the geomedian, flatten to a 2D array of bands and
            # pixel values, then use hdstats to calculate the geomedian
            flat_train = sklearn_flatten(data_masked)
            flat_train_median = hdstats.geomedian(flat_train, axis=0)
            # Geomedian will return a single value for each band so join
            # this with the class id to create a single row in the output
            stacked = np.hstack((poly_class_id, flat_train_median))

        # Append training data and label to list
        out.append(stacked)

        # Update status counter (feature number)
        i = i + 1

    # Return a list of labels for columns in output array
    return [field] + list(data.data_vars)
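# --------------------------------------------------------------------------
# Hypothetical usage sketch for the shapefile-path variant above. The
# shapefile, product and date range are placeholders, not values from the
# source.
# --------------------------------------------------------------------------
import numpy as np

out = []
column_names = get_training_data_for_shp(
    path='training_polygons.shp',          # hypothetical labelled polygons
    out=out,
    product='ls8_nbart_geomedian_annual',
    time=('2018-01-01', '2018-12-31'),
    field='classnum',
    calc_indices=['NDVI'],
    feature_stats='mean')

# Stack the per-polygon rows into a single labelled training array
model_input = np.vstack(out)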
def dNBR_processing(coordinates):

    # Load all data in baseline period available from s2a/b_ard_granule datasets
    prefire_ard = load_ard(
        dc=dc,
        products=['s2a_ard_granule', 's2b_ard_granule'],
        x=(coordinates.x - 0.1, coordinates.x + 0.1),
        y=(coordinates.y - 0.1, coordinates.y + 0.1),
        time=(prefire_start, prefire_end),
        measurements=['nbart_nir_1', 'nbart_swir_3'],
        min_gooddata=0.1,
        output_crs='EPSG:32755',  # UTM Zone 55S
        resolution=(-10, 10),
        group_by='solar_day')

    # Calculate NBR on all pre-fire images
    prefire_ard = calculate_indices(prefire_ard,
                                    index='NBR',
                                    collection='ga_s2_1',
                                    drop=False)

    # Compute median using all observations in the dataset along the time axis
    prefire_image = prefire_ard.median(dim='time')

    # Delete pre-fire observations to free memory
    del prefire_ard

    # Select NBR
    prefire_NBR = prefire_image.NBR
    del prefire_image

    # Load all data in post-fire period available from s2a/b_ard_granule datasets
    postfire_ard = load_ard(
        dc=dc,
        products=['s2a_ard_granule', 's2b_ard_granule'],
        x=(coordinates.x - 0.1, coordinates.x + 0.1),
        y=(coordinates.y - 0.1, coordinates.y + 0.1),
        time=(postfire_start, postfire_end),
        measurements=['nbart_nir_1', 'nbart_swir_3'],
        min_gooddata=0.1,
        output_crs='EPSG:32755',  # UTM Zone 55S
        resolution=(-10, 10),
        group_by='solar_day')

    # Calculate NBR on all post-fire images
    postfire_ard = calculate_indices(postfire_ard,
                                     index='NBR',
                                     collection='ga_s2_1',
                                     drop=False)

    # Calculate the median post-fire image
    postfire_image = postfire_ard.median(dim='time')
    del postfire_ard

    # Select NBR
    postfire_NBR = postfire_image.NBR
    del postfire_image

    # Calculate delta NBR (pre-fire minus post-fire)
    delta_NBR = prefire_NBR - postfire_NBR
    del prefire_NBR
    del postfire_NBR

    x = np.round_(coordinates.x, decimals=4)
    y = np.round_(coordinates.y, decimals=4)

    # Turn dNBR into an xarray dataset for export to GeoTIFF
    dnbr_dataset = delta_NBR.to_dataset(name='delta_NBR')

    # cog.write_cog(dnbr_dataset, './NBR_geotiffs/{x}_{y}_dNBR.tif')
    write_geotiff(f'/scratch/wj97/ab4513/dNBR_geotiffs/{x}_{y}_dNBR.tif',
                  dnbr_dataset)

    del delta_NBR
    del dnbr_dataset
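# --------------------------------------------------------------------------
# Hypothetical driver for dNBR_processing(). The function relies on 'dc',
# 'prefire_start'/'prefire_end' and 'postfire_start'/'postfire_end' being
# defined in the enclosing scope, and on 'coordinates' exposing .x and .y
# attributes. The dates and fire location below are placeholders.
# --------------------------------------------------------------------------
import datacube
import pandas as pd

dc = datacube.Datacube(app='dNBR_processing')

prefire_start, prefire_end = '2019-11-01', '2019-12-15'
postfire_start, postfire_end = '2020-01-15', '2020-02-29'

# One row per fire location (longitude/latitude), placeholders only
fire_points = pd.DataFrame({'x': [150.25], 'y': [-35.75]})

# itertuples() yields rows with .x and .y attributes, matching the
# interface dNBR_processing expects
for coordinates in fire_points.itertuples():
    dNBR_processing(coordinates)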