def test_mask_valid_data():
    from xarray import DataArray, Dataset
    import numpy as np

    test_attrs = {
        'one': 1,
        'nodata': -999,
    }
    expected_data_array = DataArray(np.array([[1., np.nan, np.nan],
                                              [2, 3, np.nan],
                                              [np.nan, np.nan, np.nan]],
                                             dtype='float'),
                                    attrs=test_attrs, name='var_one')
    data_array = DataArray([[1, -999, -999],
                            [2, 3, -999],
                            [-999, -999, -999]], attrs=test_attrs)
    dataset = Dataset(data_vars={'var_one': data_array},
                      attrs={'ds_attr': 'still here'})

    # Make sure the test is actually changing something
    assert not data_array.equals(expected_data_array)

    output_ds = mask_invalid_data(dataset, keep_attrs=True)
    assert output_ds.attrs['ds_attr'] == 'still here'
    assert output_ds.data_vars['var_one'].equals(expected_data_array)
    assert output_ds.data_vars['var_one'].attrs['one'] == 1

    output_da = mask_invalid_data(data_array, keep_attrs=True)
    assert output_da.equals(expected_data_array)
    assert output_da.attrs['one'] == 1
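# For context, a minimal sketch of the contract the test above exercises.
# The real function is datacube.storage.masking.mask_invalid_data; this
# illustrative stand-in only shows the behaviour being asserted (replace
# each array's `nodata` attribute value with NaN, optionally keeping
# attributes) and is not the library implementation.
def mask_invalid_data_sketch(data, keep_attrs=False):
    import xarray as xr

    def _mask(da):
        # .where() inserts NaN wherever the value equals the nodata attr
        masked = da.where(da != da.attrs['nodata'])
        if keep_attrs:
            masked.attrs = da.attrs
        return masked

    if isinstance(data, xr.Dataset):
        return data.map(_mask, keep_attrs=keep_attrs)
    return _mask(data)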
def plotIndex(data, b1, b2, p):
    bands = getBands(p)
    band1 = mask_invalid_data(data.data_vars[bands[b1]])
    band2 = mask_invalid_data(data.data_vars[bands[b2]])
    index = (band1 - band2) / (band1 + band2)
    return index
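# Hypothetical usage of plotIndex above, assuming getBands(p) maps a
# product name to a dict of band names (the product name, band keys and
# `ds` are assumptions, not part of the original code). With NIR and red
# this normalised difference is NDVI.
# ndvi = plotIndex(ds, b1='nir', b2='red', p='ls8')
# ndvi.isel(time=0).plot()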
def remove_cloud_nodata(source_prod, data, mask_band):
    ls8_USGS_cloud_pixel_qa_value = [
        324, 352, 368, 386, 388, 392, 400, 416, 432, 480, 864, 880, 898,
        900, 904, 928, 944, 992, 1350
    ]
    non_ls8_USGS_cloud_pixel_qa_value = [
        72, 96, 112, 130, 132, 136, 144, 160, 176, 224
    ]
    non_ls8_USGS_sr_cloud_qa_value = [2, 4, 12, 20, 34, 36, 52]

    mask_data = data[mask_band]
    nodata_value = mask_data.nodata
    nodata_cloud_value = []

    if 'usgs' in source_prod:
        if 'ls8' in source_prod:
            nodata_cloud_value = ls8_USGS_cloud_pixel_qa_value
        else:
            if mask_band == 'sr_cloud_qa':
                nodata_cloud_value = non_ls8_USGS_sr_cloud_qa_value
            else:
                nodata_cloud_value = non_ls8_USGS_cloud_pixel_qa_value

        nodata_cloud_value.append(nodata_value)
        nodata_cloud = np.isin(mask_data, nodata_cloud_value)
        cld_free = data.where(~nodata_cloud).dropna(dim='time', how='all')
    else:
        cld_free = data.where(mask_data == 1).dropna(dim='time', how='all')

    # Remove nodata for the pixel of interest
    cld_free_valid = masking.mask_invalid_data(cld_free)
    return cld_free_valid
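# A toy illustration of the np.isin masking used above; the QA values and
# array shape are made up. Pixels whose QA value appears in the
# nodata/cloud list are flagged, everything else is kept.
import numpy as np

qa = np.array([[322, 324],
               [352, 2]])
nodata_cloud = np.isin(qa, [324, 352])
# nodata_cloud == array([[False,  True],
#                        [ True, False]])
clear = np.where(~nodata_cloud, qa, -999)  # flagged pixels become -999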
def sensible_mask_invalid_data(data):
    # TODO This should be pushed up to datacube-core
    # xarray.DataArray.where() converts ints to floats, since NaNs are
    # used to represent nodata. By default this uses float64, which is
    # way over the top for an int16 value, so let's convert to float32
    # first, to save a bunch of memory.
    data = _convert_to_floats(data)  # This is stripping out variable attributes
    return mask_invalid_data(data)
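# A guess at what the _convert_to_floats helper above might look like; the
# body is an assumption based only on the surrounding comments (cast
# integer variables to float32 rather than letting .where() pick float64).
# Note it does not preserve variable attributes, which is exactly the
# shortcoming the comment above points out.
def _convert_to_floats_sketch(data):
    import numpy as np

    def _to_float32(da):
        # Leave arrays that are already float32 untouched
        if da.dtype != np.float32:
            return da.astype(np.float32)
        return da

    return data.map(_to_float32)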
def getDataset(time, poly, crs):
    fetch_ds = combined_ls_sref.query(dc, geopolygon=poly, time=time)
    grouped_ds = combined_ls_sref.group(fetch_ds,
                                        resolution=(-30, 30),
                                        output_crs='EPSG:{}'.format(crs))
    ds = combined_ls_sref.fetch(grouped_ds)
    ds = ds.sortby('time')
    ds = mask_invalid_data(ds)
    ds = ds.dropna('time', how='all')
    return ds
def plotRGB(data):
    fake_saturation = 40000

    # Sets all `nodata` values to ``nan``.
    data = mask_invalid_data(data)

    # Isolate the color dimension in an xarray.DataArray, and use
    # transpose to make color the last dimension
    rgb = data.to_array(dim='color')
    rgb = rgb.transpose(*(rgb.dims[1:] + rgb.dims[:1]))

    # Filter out pixels where any band is 'saturated'
    rgb = rgb.where((rgb <= fake_saturation).all(dim='color'))

    # Scale to [0, 1] range for imshow
    rgb /= fake_saturation
    return rgb
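# Hypothetical usage of plotRGB above; the product and band names are
# assumptions and need a live datacube connection, so the call is shown
# commented out. The result is a [0, 1]-scaled array ready for imshow.
# ds = dc.load(product='ls8_nbart_albers',
#              measurements=['red', 'green', 'blue'], **query)
# rgb = plotRGB(ds)
# rgb.isel(time=0).plot.imshow()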
def time_series(query, fp):
    """Returns multiple images with R,G,B values mapped to measurements parameter

    :param dict query: x (or longitude), y (or latitude), time
    :param file object fp: optional file object to save plots or other bulky files
    :return: raw HTTP response (json or image/*)
    """
    # Keep these imports here to avoid breaking the rest of the file when
    # these libraries do not exist
    import matplotlib
    # pyplot will try to plot on an X11 Display without this:
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import datacube
    from datacube.storage.masking import mask_invalid_data

    if 'granule' in DATASET['product']:
        query['resolution'] = (-0.000135, 0.000135)
        query['output_crs'] = 'EPSG:4326'

    dc = datacube.Datacube(env=DATASET['env'], app="ndvi_time_series")
    data = dc.load(DATASET['product'], **query)
    data = mask_invalid_data(data)

    rgb = data.to_array(dim='color')
    fake_saturation = 4000
    rgb = rgb.transpose(*(rgb.dims[1:] + rgb.dims[:1]))  # make 'color' the last dimension
    rgb = rgb.where((rgb <= fake_saturation).all(dim='color'))  # mask out pixels where any band is 'saturated'
    rgb /= fake_saturation  # scale to [0, 1] range for imshow

    try:
        rgb.plot.imshow(x=data.crs.dimensions[1], y=data.crs.dimensions[0],
                        col='time', col_wrap=5)
    except Exception as err:
        return error("Plotting failed: {}".format(err))

    # Save to supplied file object
    plt.savefig(fp, dpi=150, format='jpg')
    plt.gcf().clear()  # clear figure instead of combining new images with old
    size = fp.tell()
    return {'error': 0, 'mimetype': 'image/jpg', 'size': size}
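# Hypothetical call to the time_series endpoint helper above, writing the
# plot into an in-memory buffer; the query values are made up and the call
# needs a configured DATASET and datacube, so it is shown commented out.
# from io import BytesIO
# buf = BytesIO()
# resp = time_series({'x': (149.0, 149.2), 'y': (-35.4, -35.2),
#                     'time': ('2018-01-01', '2018-03-01')}, buf)
# assert resp['mimetype'] == 'image/jpg'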
def getDataset(crs, xmin, xmax, ymin, ymax):
    "Fetch all data for the given area."
    print("Fetching data...")
    fetch_ds = combined_ls_sref.query(dc,
                                      x=(xmin, xmax),
                                      y=(ymin, ymax),
                                      crs='EPSG:{}'.format(crs),
                                      time=('2009-01-01', '2011-12-31'))
    grouped_ds = combined_ls_sref.group(fetch_ds,
                                        resolution=(-30, 30),
                                        output_crs='EPSG:{}'.format(crs))
    ds = combined_ls_sref.fetch(grouped_ds)
    ds = mask_invalid_data(ds)
    print("Done.")
    return ds
def load_ard(dc,
             products=None,
             min_gooddata=0.0,
             fmask_gooddata=[1, 4, 5],
             mask_pixel_quality=True,
             mask_invalid_data=True,
             ls7_slc_off=True,
             product_metadata=False,
             dask_chunks={'time': 1},
             lazy_load=False,
             **dcload_kwargs):
    '''
    Loads Landsat Collection 3 or Sentinel 2 Definitive and Near Real
    Time data for multiple sensors (i.e. ls5t, ls7e and ls8c for
    Landsat; s2a and s2b for Sentinel 2), and returns a single masked
    xarray dataset containing only observations that contain greater
    than a given proportion of good quality pixels. This can be used
    to extract clean time series of observations that are not affected
    by cloud, for example as an input to the `animated_timeseries`
    function from `dea_plotting`.

    The proportion of good quality pixels is calculated by summing the
    pixels flagged as good quality in `fmask`. By default non-cloudy or
    shadowed land, snow and water pixels are treated as good quality,
    but this can be customised using the `fmask_gooddata` parameter.

    MEMORY ISSUES: For large data extractions, it can be advisable to
    set `mask_pixel_quality=False`. The masking step coerces all
    numeric values to float32 when NaN values are inserted into the
    array, potentially causing your data to use twice the memory. Be
    aware that the resulting arrays will contain invalid values which
    may affect future analyses.

    Last modified: September 2019

    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.
    products : list
        A list of product names to load data from. Valid options are
        ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] for Landsat,
        ['s2a_ard_granule', 's2b_ard_granule'] for Sentinel 2
        Definitive, and ['s2a_nrt_granule', 's2b_nrt_granule'] for
        Sentinel 2 Near Real Time.
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a satellite observation to be loaded.
        Defaults to 0.0 which will return all observations regardless
        of pixel quality (set to e.g. 0.99 to return only observations
        with more than 99% good quality pixels).
    fmask_gooddata : list, optional
        An optional list of fmask values to treat as good quality
        observations in the above `min_gooddata` calculation. The
        default is `[1, 4, 5]` which will return non-cloudy or shadowed
        land, snow and water pixels. Choose from:
        `{'0': 'nodata', '1': 'valid', '2': 'cloud', '3': 'shadow',
          '4': 'snow', '5': 'water'}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data
        mask to all observations that were not filtered out for having
        less good quality pixels than `min_gooddata`. E.g. if
        `min_gooddata=0.99`, the filtered observations may still
        contain up to 1% poor quality pixels. Setting this to False
        simply returns the resulting observations without masking out
        these pixels; the default of True masks them out and sets them
        to NaN using the good data mask. This will convert numeric
        values to float32 which can cause memory issues; set to False
        to prevent this.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata
        values should be replaced with NaN. These invalid values can be
        caused by missing data along the edges of scenes, or terrain
        effects (for NBAR-T). Setting `mask_invalid_data=True` will
        convert all numeric values to float32 when -999 values are
        replaced with NaN, which can cause memory issues; set to False
        to prevent this. Defaults to True.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        True, which keeps all Landsat 7 observations > May 31 2003.
    product_metadata : bool, optional
        An optional boolean indicating whether to return the dataset
        with a `product` variable that gives the name of the product
        that each observation in the time series came from (e.g.
        'ga_ls5t_ard_3'). Defaults to False.
    dask_chunks : dict, optional
        An optional dictionary containing the coords and sizes you wish
        to create dask chunks over. Usually used in combination with
        `lazy_load=True` (see below). For example:
        `dask_chunks = {'x': 500, 'y': 500}`
    lazy_load : boolean, optional
        Setting this variable to True will delay the computation of the
        function until you explicitly run `ds.compute()`. If used in
        conjunction with `dask.distributed.Client()` this will allow
        for automatic parallel computation.
    **dcload_kwargs :
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query used to extract data. This can include
        `x`, `y`, `time`, `resolution`, `resampling`, `group_by`, `crs`
        etc, and can either be listed directly in the `load_ard` call
        (e.g. `x=(150.0, 151.0)`), or by passing in a query kwarg
        (e.g. `**query`). For a full list of possible options, see:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that
        contain greater than `min_gooddata` proportion of good quality
        pixels.
    '''

    # Due to possible bug in xarray 0.13.0, define temporary function
    # which converts dtypes in a way that preserves attributes
    def astype_attrs(da, dtype=np.float32):
        '''
        Loop through all data variables in the dataset, record
        attributes, convert to float32, then reassign attributes. If
        the data variable cannot be converted to float32 (e.g. for a
        non-numeric dtype like strings), skip and return the variable
        unchanged.
        '''
        try:
            da_attr = da.attrs
            da = da.astype(dtype)
            da = da.assign_attrs(**da_attr)
            return da
        except ValueError:
            return da

    # Verify that products were provided
    if not products:
        raise ValueError("Please provide a list of product names "
                         "to load data from. Valid options are: \n"
                         "['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] "
                         "for Landsat, ['s2a_ard_granule', "
                         "'s2b_ard_granule'] \nfor Sentinel 2 Definitive, or "
                         "['s2a_nrt_granule', 's2b_nrt_granule'] for "
                         "Sentinel 2 Near Real Time")

    # If `measurements` are specified but do not include fmask, add it
    if (('measurements' in dcload_kwargs) and
            ('fmask' not in dcload_kwargs['measurements'])):
        dcload_kwargs['measurements'].append('fmask')

    # Create a list to hold data for each product
    product_data = []

    # Iterate through each requested product
    for product in products:

        try:
            # Load data including fmask band
            print(f'Loading {product} data')
            try:
                ds = dc.load(product=f'{product}',
                             dask_chunks=dask_chunks,
                             **dcload_kwargs)
            except KeyError as e:
                raise ValueError(f'Band {e} does not exist in this product. '
                                 f'Verify all requested `measurements` exist '
                                 f'in {products}')

            # Keep a record of the original number of observations
            total_obs = len(ds.time)

            # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
            if not ls7_slc_off and product == 'ga_ls7e_ard_3':
                print('    Ignoring SLC-off observations for ls7')
                ds = ds.sel(time=ds.time < np.datetime64('2003-05-30'))

            # If no measurements are specified, `fmask` is given a
            # different name. If necessary, rename it:
            if 'oa_fmask' in ds:
                ds = ds.rename({'oa_fmask': 'fmask'})

            # Identify all pixels not affected by cloud/shadow/invalid
            good_quality = ds.fmask.isin(fmask_gooddata)

            # The good data percentage calculation has to load in all
            # `fmask` data, which can be slow. If the user has chosen
            # no filtering by using the default `min_gooddata = 0`, we
            # can skip this step completely to save processing time
            if min_gooddata > 0.0:

                # Compute good data for each observation as % of total pixels
                data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                             (good_quality.shape[1] * good_quality.shape[2]))

                # Filter by `min_gooddata` to drop low quality observations
                ds = ds.sel(time=data_perc >= min_gooddata)
                print(f'    Filtering to {len(ds.time)} '
                      f'out of {total_obs} observations')

            # Optionally apply pixel quality mask to observations
            # remaining after the filtering step above to mask out all
            # remaining bad quality pixels
            if mask_pixel_quality and (len(ds.time) > 0):
                print('    Applying pixel quality mask')

                # First change dtype to float32, then mask out values
                # using `.where()`. By casting to float32, we prevent
                # `.where()` from automatically casting to float64,
                # using 2x the memory. We need to do this by applying
                # a custom function to every variable in the dataset
                # instead of using `.astype()`, due to a possible bug
                # in xarray 0.13.0 that drops attributes
                ds = ds.apply(astype_attrs, dtype=np.float32,
                              keep_attrs=True)
                ds = ds.where(good_quality)

            # Optionally add satellite/product name as a new variable
            if product_metadata:
                ds['product'] = xr.DataArray([product] * len(ds.time),
                                             [('time', ds.time)])

            # If any data was returned, add result to list
            if len(ds.time) > 0:
                product_data.append(ds.drop('fmask'))

        # If AttributeError due to there being no `fmask` variable in
        # the dataset, skip this product and move on to the next
        except AttributeError:
            print(f'    No data for {product}')

    # If any data was returned above, combine into one xarray
    if len(product_data) > 0:

        # Concatenate results and sort by time
        print('Combining and sorting data')
        combined_ds = xr.concat(product_data, dim='time').sortby('time')

        # Optionally filter to replace no data values with NaNs
        if mask_invalid_data:
            print('    Masking out invalid values')

            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the
            # memory. We need to do this by applying a custom function
            # to every variable in the dataset instead of using
            # `.astype()`, due to a possible bug in xarray 0.13.0 that
            # drops attributes
            combined_ds = combined_ds.apply(astype_attrs,
                                            dtype=np.float32,
                                            keep_attrs=True)
            combined_ds = masking.mask_invalid_data(combined_ds)

        # If `lazy_load` is True, return data as a dask array without
        # actually loading it in
        if lazy_load:
            print(f'    Returning {len(combined_ds.time)} observations'
                  ' as a dask array')
            return combined_ds
        else:
            print(f'    Returning {len(combined_ds.time)} observations')
            return combined_ds.compute()

    # If no data was returned:
    else:
        print('No data returned for query')
        return None
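# Hypothetical usage of load_ard above, following the call pattern its
# docstring describes; the coordinates and dates are made up and a live
# datacube is required, so the call is shown commented out.
# import datacube
# dc = datacube.Datacube(app='load_ard_example')
# ds = load_ard(dc=dc,
#               products=['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'],
#               min_gooddata=0.90,
#               x=(153.38, 153.47), y=(-28.92, -28.83),
#               time=('2018-01', '2018-06'))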
import datacube
from datacube.storage.masking import mask_invalid_data

query = {
    'time': ('1990-01-01', '1991-01-01'),
    'lat': (-35.2, -35.4),
    'lon': (149.0, 149.2),
}

dc = datacube.Datacube(app='plot-rgb-recipe')
data = dc.load(product='ls5_nbar_albers',
               measurements=['red', 'green', 'blue'],
               **query)
data = mask_invalid_data(data)

fake_saturation = 4000
rgb = data.to_array(dim='color')
rgb = rgb.transpose(*(rgb.dims[1:] + rgb.dims[:1]))  # make 'color' the last dimension
rgb = rgb.where((rgb <= fake_saturation).all(dim='color'))  # mask out pixels where any band is 'saturated'
rgb /= fake_saturation  # scale to [0, 1] range for imshow

rgb.plot.imshow(x=data.crs.dimensions[1], y=data.crs.dimensions[0],
                col='time', col_wrap=5, add_colorbar=False)
def load_ard(dc,
             products=None,
             min_gooddata=0.0,
             fmask_gooddata=[1, 4, 5],
             mask_pixel_quality=True,
             mask_invalid_data=True,
             mask_contiguity=False,
             mask_dtype=np.float32,
             ls7_slc_off=True,
             product_metadata=False,
             **dcload_kwargs):
    '''
    Loads Landsat Collection 3 or Sentinel 2 Definitive and Near Real
    Time data for multiple sensors (i.e. ls5t, ls7e and ls8c for
    Landsat; s2a and s2b for Sentinel 2), and returns a single masked
    xarray dataset containing only observations that contain greater
    than a given proportion of good quality pixels. This can be used
    to extract clean time series of observations that are not affected
    by cloud, for example as an input to the `animated_timeseries`
    function from `dea_plotting`.

    The proportion of good quality pixels is calculated by summing the
    pixels flagged as good quality in `fmask`. By default non-cloudy or
    shadowed land, snow and water pixels are treated as good quality,
    but this can be customised using the `fmask_gooddata` parameter.

    Last modified: March 2020

    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.
    products : list
        A list of product names to load data from. Valid options are
        ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] for Landsat,
        ['s2a_ard_granule', 's2b_ard_granule'] for Sentinel 2
        Definitive, and ['s2a_nrt_granule', 's2b_nrt_granule'] for
        Sentinel 2 Near Real Time (on the DEA Sandbox only).
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a satellite observation to be loaded.
        Defaults to 0.0 which will return all observations regardless
        of pixel quality (set to e.g. 0.99 to return only observations
        with more than 99% good quality pixels).
    fmask_gooddata : list, optional
        An optional list of fmask values to treat as good quality
        observations in the above `min_gooddata` calculation. The
        default is `[1, 4, 5]` which will return non-cloudy or shadowed
        land, snow and water pixels. Choose from:
        `{'0': 'nodata', '1': 'valid', '2': 'cloud', '3': 'shadow',
          '4': 'snow', '5': 'water'}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data
        mask to all observations that were not filtered out for having
        less good quality pixels than `min_gooddata`. E.g. if
        `min_gooddata=0.99`, the filtered observations may still
        contain up to 1% poor quality pixels. Setting this to False
        simply returns the resulting observations without masking out
        these pixels; the default of True masks them and sets them to
        NaN using the good data mask. This will convert numeric values
        to floating point values which can cause memory issues; set to
        False to prevent this.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata
        values should be replaced with NaN. These invalid values can be
        caused by missing data along the edges of scenes, or terrain
        effects (for NBART). Be aware that masking out invalid values
        will convert all numeric values to floating point values when
        -999 values are replaced with NaN, which can cause memory
        issues.
    mask_contiguity : str or bool, optional
        An optional string or boolean indicating whether to mask out
        pixels missing data in any band (i.e. "non-contiguous" values).
        This can be important for generating clean composite datasets.
        The default is False, which will ignore non-contiguous values
        completely. If loading NBART data, set the parameter to:
        `mask_contiguity='nbart_contiguity'`. If loading NBAR data,
        specify `mask_contiguity='nbar_contiguity'` instead.
        Non-contiguous pixels will be set to NaN if `dtype='auto'`, or
        set to the data's native nodata value if `dtype='native'`
        (which can be useful for reducing memory).
    mask_dtype : numpy dtype, optional
        An optional parameter that controls the data type/dtype that
        layers are coerced to when `mask_pixel_quality=True` or
        `mask_contiguity=True`. Defaults to `np.float32`, which uses
        approximately 1/2 the memory of `np.float64`.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        True, which keeps all Landsat 7 observations > May 31 2003.
    product_metadata : bool, optional
        An optional boolean indicating whether to return the dataset
        with a `product` variable that gives the name of the product
        that each observation in the time series came from (e.g.
        'ga_ls5t_ard_3'). Defaults to False.
    **dcload_kwargs :
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query used to extract data. This typically
        includes `measurements`, `x`, `y`, `time`, `resolution`,
        `resampling`, `group_by` and `crs`. Keyword arguments can
        either be listed directly in the `load_ard` call like any
        other parameter (e.g. `measurements=['nbart_red']`), or by
        passing in a query kwarg dictionary (e.g. `**query`). For a
        list of possible options, see the `dc.load` documentation:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that
        contain greater than `min_gooddata` proportion of good quality
        pixels.
    '''

    # Due to possible bug in xarray 0.13.0, define temporary function
    # which converts dtypes in a way that preserves attributes
    def astype_attrs(da, dtype=np.float32):
        '''
        Loop through all data variables in the dataset, record
        attributes, convert to a custom dtype, then reassign
        attributes. If the data variable cannot be converted to the
        custom dtype (e.g. trying to convert non-numeric dtype like
        strings to floats), skip and return the variable unchanged.

        This can be combined with `.where()` to save memory. By casting
        to e.g. np.float32, we prevent `.where()` from automatically
        casting to np.float64, using 2x the memory. np.float16 could be
        used to save even more memory (although this may not be
        compatible with all downstream applications).

        This custom function is required instead of using xarray's
        built-in `.astype()`, due to a bug in xarray 0.13.0 that drops
        attributes: https://github.com/pydata/xarray/issues/3348
        '''
        try:
            da_attr = da.attrs
            da = da.astype(dtype)
            da = da.assign_attrs(**da_attr)
            return da
        except ValueError:
            return da

    # To prevent modifications to dcload_kwargs being made by this
    # function remaining after the function is run (potentially causing
    # different results each time the function is run), first take a
    # deep copy of the dcload_kwargs object
    dcload_kwargs = deepcopy(dcload_kwargs)

    # Determine if lazy loading is required
    lazy_load = 'dask_chunks' in dcload_kwargs

    # Warn user if they combine lazy load with min_gooddata
    if (min_gooddata > 0.0) and lazy_load:
        warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                      "will cause dask arrays to compute when "
                      "loading pixel-quality data to calculate "
                      "'good pixel' percentage. This can "
                      "significantly slow the return of your dataset.")

    # Verify that products were provided, and that only Sentinel-2 or
    # only Landsat products are being loaded at the same time
    if not products:
        raise ValueError("Please provide a list of product names "
                         "to load data from. Valid options are: \n"
                         "['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] "
                         "for Landsat, ['s2a_ard_granule', "
                         "'s2b_ard_granule'] \nfor Sentinel 2 Definitive, or "
                         "['s2a_nrt_granule', 's2b_nrt_granule'] for "
                         "Sentinel 2 Near Real Time")
    elif all(['ls' in product for product in products]):
        product_type = 'ls'
    elif all(['s2' in product for product in products]):
        product_type = 's2'
    else:
        raise ValueError("Loading both Sentinel-2 and Landsat data "
                         "at the same time is currently not supported")

    # If `measurements` are specified but do not include fmask or
    # contiguity variables, add these to `measurements`
    to_drop = []  # store loaded var names here to later drop
    fmask_band = 'fmask'

    if 'measurements' in dcload_kwargs:
        if fmask_band not in dcload_kwargs['measurements']:
            dcload_kwargs['measurements'].append(fmask_band)
            to_drop.append(fmask_band)

        if (mask_contiguity and
                (mask_contiguity not in dcload_kwargs['measurements'])):
            dcload_kwargs['measurements'].append(mask_contiguity)
            to_drop.append(mask_contiguity)

    # If no `measurements` are specified, Landsat ancillary bands are
    # loaded with a 'oa_' prefix, but Sentinel-2 bands are not. As a
    # work-around, we need to rename the default contiguity and fmask
    # bands if loading Landsat data without specifying `measurements`
    elif product_type == 'ls':
        mask_contiguity = f'oa_{mask_contiguity}' if mask_contiguity else False
        fmask_band = f'oa_{fmask_band}'

    # Create a list to hold data for each product
    product_data = []

    # Iterate through each requested product
    for product in products:

        try:
            # Load data including fmask band
            print(f'Loading {product} data')
            try:
                # If dask_chunks is specified, load data using query
                if lazy_load:
                    ds = dc.load(product=f'{product}', **dcload_kwargs)

                # If no dask chunks specified, add this param so that
                # we can lazy load data before filtering by good data
                else:
                    ds = dc.load(product=f'{product}', dask_chunks={},
                                 **dcload_kwargs)

            except KeyError as e:
                raise ValueError(f'Band {e} does not exist in this product. '
                                 f'Verify all requested `measurements` exist '
                                 f'in {products}')

            # Keep a record of the original number of observations
            total_obs = len(ds.time)

            # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
            if not ls7_slc_off and product == 'ga_ls7e_ard_3':
                print('    Ignoring SLC-off observations for ls7')
                ds = ds.sel(time=ds.time < np.datetime64('2003-05-31'))

            # Identify all pixels not affected by cloud/shadow/invalid
            good_quality = ds[fmask_band].isin(fmask_gooddata)

            # The good data percentage calculation has to load in all
            # `fmask` data, which can be slow. If the user has chosen
            # no filtering by using the default `min_gooddata = 0`, we
            # can skip this step completely to save processing time
            if min_gooddata > 0.0:

                # Compute good data for each observation as % of total pixels
                data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                             (good_quality.shape[1] * good_quality.shape[2]))

                # Filter by `min_gooddata` to drop low quality observations
                ds = ds.sel(time=data_perc >= min_gooddata)
                print(f'    Filtering to {len(ds.time)} '
                      f'out of {total_obs} observations')

            # If any data was returned
            if len(ds.time) > 0:

                # Optionally apply pixel quality mask to observations
                # remaining after the filtering step above to mask out
                # all remaining bad quality pixels
                if mask_pixel_quality:
                    print('    Applying pixel quality/cloud mask')

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = ds.where(good_quality)

                # Optionally filter to replace no data values with nans
                if mask_invalid_data:
                    print('    Applying invalid data mask')

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = masking.mask_invalid_data(ds)

                # Optionally apply contiguity mask to observations to
                # remove pixels missing data in any band
                if mask_contiguity:
                    print('    Applying contiguity mask')

                    # Change dtype to custom float before masking to
                    # save memory. See `astype_attrs` func docstring
                    # above for details
                    ds = ds.apply(astype_attrs,
                                  dtype=mask_dtype,
                                  keep_attrs=True)
                    ds = ds.where(ds[mask_contiguity] == 1)

                # Optionally add satellite/product name as a new variable
                if product_metadata:
                    ds['product'] = xr.DataArray([product] * len(ds.time),
                                                 [('time', ds.time)])

                # If any data was returned, add result to list
                product_data.append(ds.drop(to_drop))

            # If no data is returned, print status
            else:
                print(f'    No data for {product}')

        # If AttributeError due to there being no variables in
        # the dataset, skip this product and move on to the next
        except AttributeError:
            print(f'    No data for {product}')

    # If any data was returned above, combine into one xarray
    if len(product_data) > 0:

        # Concatenate results and sort by time
        print('Combining and sorting data')
        combined_ds = xr.concat(product_data, dim='time').sortby('time')

        # If `lazy_load` is True, return data as a dask array without
        # actually loading it in
        if lazy_load:
            print(f'    Returning {len(combined_ds.time)} observations'
                  ' as a dask array')
            return combined_ds
        else:
            print(f'    Returning {len(combined_ds.time)} observations')
            return combined_ds.compute()

    # If no data was returned:
    else:
        print('No data returned for query')
        return None
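# Hypothetical usage of the load_ard variant above, enabling the
# contiguity mask for NBART data as its docstring suggests; the query
# values are made up and a live datacube is required, so the call is
# shown commented out.
# ds = load_ard(dc=dc,
#               products=['ga_ls8c_ard_3'],
#               measurements=['nbart_red', 'nbart_green', 'nbart_blue'],
#               mask_contiguity='nbart_contiguity',
#               x=(153.38, 153.47), y=(-28.92, -28.83),
#               time=('2019-01', '2019-06'))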
def load_clearsentinel2(dc, query, sensors=('s2a', 's2b'), product='ard',
                        bands_of_interest=('nbart_red', 'nbart_green',
                                           'nbart_blue', 'nbart_nir_1',
                                           'nbart_swir_2', 'nbart_swir_3'),
                        masked_prop=0.99, mask_values=(0, 2, 3),
                        pixel_quality_band='fmask',
                        mask_pixel_quality=False, mask_invalid_data=True,
                        satellite_metadata=False):
    """
    Loads Sentinel 2 data for multiple sensors (i.e. s2a, s2b), and
    returns a single xarray dataset containing only observations that
    contain greater than a given proportion of good quality pixels.
    This can be used to extract visually appealing time series of
    observations that are not affected by cloud, for example as an
    input to the `animated_timeseries` function from `DEAPlotting`.

    The proportion of good quality pixels is calculated by summing the
    pixels that are not flagged as poor quality in the Sentinel pixel
    quality array. By default pixels flagged as nodata, cloud or shadow
    are used to calculate the number of poor quality pixels, but this
    can be customised using the `mask_values` parameter.

    MEMORY ISSUES: For large data extractions, it is recommended that
    you set both `mask_pixel_quality=False` and
    `mask_invalid_data=False`. Otherwise, all output variables will be
    coerced to float64 when NaN values are inserted into the array,
    potentially causing your data to use 4x as much memory. Be aware
    that the resulting arrays will contain invalid -999 values which
    should be considered in analyses.

    Last modified: November 2018
    Author: Robbi Bishop-Taylor

    :param dc:
        A specific Datacube to import from, i.e.
        `dc = datacube.Datacube(app='Sentinel datacube')`. This allows
        you to also use development datacubes if they have been
        imported into the environment.
    :param query:
        A dict containing the query bounds. Can include lat/lon, time
        etc. If no `time` query is given, the function defaults to all
        time steps available to all sensors (e.g. 2015 onward)
    :param sensors:
        An optional list of Sentinel 2 sensors to load data for.
        Options are 's2a', and 's2b'; defaults to both.
    :param product:
        An optional string specifying the product to load. Defaults to
        'ard', which is equivalent to loading e.g. `s2a_ard_granule`.
    :param bands_of_interest:
        An optional list of strings containing the bands to be read in;
        to view full list run the following:
        `dc.list_measurements().loc['s2b_ard_granule']`. Defaults to
        `('nbart_red', 'nbart_green', 'nbart_blue', 'nbart_nir_1',
        'nbart_swir_2', 'nbart_swir_3')`.
    :param masked_prop:
        An optional float giving the minimum percentage of good quality
        pixels required for a Sentinel 2 observation to be loaded.
        Defaults to 0.99 (i.e. only return observations with less than
        1% of poor quality pixels).
    :param mask_values:
        An optional list of pixel quality values to treat as poor
        quality observations in the above `masked_prop` calculation.
        The default is `[0, 2, 3]` which treats nodata, cloud and cloud
        shadow as poor quality. Choose from:
        `{'0': 'nodata', '1': 'valid', '2': 'cloud', '3': 'shadow',
          '4': 'snow', '5': 'water'}`.
    :param pixel_quality_band:
        An optional string giving the name of the pixel quality band
        contained in the Sentinel 2 dataset. The default value is
        'fmask'.
    :param mask_pixel_quality:
        An optional boolean indicating whether to apply the pixel
        quality mask to all observations that were not filtered out for
        having less good quality pixels than `masked_prop`. For
        example, if `masked_prop=0.99`, the filtered images may still
        contain up to 1% poor quality pixels. The default of False
        simply returns the resulting observations without masking out
        these pixels; True masks them out and sets them to NaN using
        the pixel quality mask, but has the side effect of changing the
        data type of the output arrays from int16 to float64 which can
        cause memory issues. To reduce memory usage, set to False.
    :param mask_invalid_data:
        An optional boolean indicating whether invalid -999 nodata
        values should be replaced with NaN. Defaults to True; this has
        the side effect of changing the data type of the output arrays
        from int16 to float64 which can cause memory issues. To reduce
        memory usage, set to False.
    :param satellite_metadata:
        An optional boolean indicating whether to return the dataset
        with a `satellite` variable that gives the name of the
        satellite that made each observation in the time series (i.e.
        s2a, s2b). Defaults to False.

    :returns:
        An xarray dataset containing only Sentinel 2 observations that
        contain greater than `masked_prop` proportion of clear pixels.

    :example:

    >>> # Import modules
    >>> import datacube
    >>> import sys

    >>> # Import dea-notebooks functions using relative link to 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling

    >>> # Connect to a datacube containing Sentinel data
    >>> dc = datacube.Datacube(app='load_clearsentinel')

    >>> # Set up spatial and temporal query; note that 'output_crs' and 'resolution' need to be set
    >>> query = {'x': (-191400.0, -183400.0),
    ...          'y': (-1423460.0, -1415460.0),
    ...          'time': ('2018-01-01', '2018-03-01'),
    ...          'crs': 'EPSG:3577',
    ...          'output_crs': 'EPSG:3577',
    ...          'resolution': (10, 10)}

    >>> # Load observations with less than 70% cloud from both S2A and S2B as a single combined dataset
    >>> sentinel_ds = DEADataHandling.load_clearsentinel2(dc=dc, query=query, sensors=['s2a', 's2b'],
    ...                                                   bands_of_interest=['nbart_red', 'nbart_green', 'nbart_blue'],
    ...                                                   masked_prop=0.3, mask_pixel_quality=True)
    Loading s2a pixel quality
        Loading 3 filtered s2a timesteps
    Loading s2b pixel quality
        Loading 2 filtered s2b timesteps
    Combining and sorting s2a, s2b data
        Replacing invalid -999 values with NaN (data will be coerced to float64)

    >>> # Test that function returned data
    >>> len(sentinel_ds.time) > 0
    True
    """

    # List to save results from each sensor and list to keep names of
    # successfully processed sensors
    filtered_sensors = []
    successfully_returned = []

    # Iterate through all sensors, returning only observations with
    # > mask_prop clear pixels
    for sensor in sensors:

        try:
            # If bands of interest are given, assign measurements in
            # dc.load call. This is for compatibility with the existing
            # dea-notebooks load_nbarx function.
            if bands_of_interest:

                # Lazily load Sentinel 2 data using dask
                data = dc.load(product=f'{sensor}_{product}_granule',
                               measurements=bands_of_interest,
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)

            # If no bands of interest given, run without specifying
            # measurements, and therefore return all available bands
            else:

                # Lazily load Sentinel 2 data using dask
                data = dc.load(product=f'{sensor}_{product}_granule',
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)

            # Load PQ data
            print(f'Loading {sensor} pixel quality')
            pq = dc.load(product=f'{sensor}_{product}_granule',
                         measurements=[pixel_quality_band],
                         group_by='solar_day',
                         **query)

            # Identify pixels with valid data
            good_quality = np.isin(pq[pixel_quality_band],
                                   test_elements=mask_values,
                                   invert=True)
            good_quality = pq[pixel_quality_band].where(good_quality).notnull()

            # Compute good data for each observation as a percentage of
            # total array pixels
            data_perc = (good_quality.sum(dim=['x', 'y']) /
                         (good_quality.shape[1] * good_quality.shape[2]))

            # Add data_perc data to Sentinel 2 dataset as a new xarray variable
            data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

            # Filter by data_perc to drop low quality observations and
            # finally import data using dask
            filtered = data.sel(time=data.data_perc >= masked_prop)
            print(f'    Loading {len(filtered.time)} filtered {sensor} timesteps')
            filtered = filtered.compute()

            # Optionally apply pixel quality mask to all observations
            # that were not dropped in previous step
            if mask_pixel_quality:
                filtered = filtered.where(good_quality)

            # Optionally add satellite name
            if satellite_metadata:
                filtered['satellite'] = xr.DataArray([sensor] * len(filtered.time),
                                                     [('time', filtered.time)])

            # Append result to list and add sensor name to list of
            # successfully returned sensors
            filtered_sensors.append(filtered)
            successfully_returned.append(sensor)

            # Close datasets
            filtered = None
            good_quality = None
            data = None

        except:
            # If there is no data for sensor or if another error occurs:
            print(f'    Skipping {sensor}; no valid data for query')

    # Concatenate all sensors into one big xarray dataset, and then sort by time
    sensor_string = ", ".join(successfully_returned)
    print(f'Combining and sorting {sensor_string} data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')

    # Optionally filter to replace invalid data values with nans
    if mask_invalid_data:
        print('    Replacing invalid -999 values with NaN '
              '(data will be coerced to float64)')
        combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
def load_clearlandsat(dc, query, sensors=('ls5', 'ls7', 'ls8'),
                      product='nbart', bands_of_interest=None,
                      masked_prop=0.99, mask_dict=None,
                      mask_pixel_quality=False, mask_invalid_data=True,
                      ls7_slc_off=False, satellite_metadata=False):
    """Load cloud-free data from multiple Landsat satellites as an xarray dataset

    Loads Landsat NBAR, NBART or FC25 and PQ data for multiple sensors
    (i.e. ls5, ls7, ls8) and returns a single xarray dataset containing
    only observations that contain greater than a given proportion of
    good quality pixels. This function can be used to extract visually
    appealing time series of observations that are not affected by
    cloud, for example as an input to the `animated_timeseries`
    function from `DEAPlotting`.

    The proportion of clear pixels is calculated by summing the pixels
    that are not flagged as being poor quality in the Landsat PQ25
    layer. By default only cloudy pixels or pixels that are missing
    data in any band are used to calculate the number of poor quality
    pixels, but this can be customised using the `mask_dict` parameter.

    Last modified: October 2018
    Author: Robbi Bishop-Taylor, Bex Dunn

    Parameters
    ----------
    dc : datacube Datacube object
        A specific Datacube to import from, i.e.
        `dc = datacube.Datacube(app='Clear Landsat')`. This allows you
        to also use development datacubes if they have been imported
        into the environment.
    query : dict
        A dict containing the query bounds. Can include lat/lon, time
        etc. If no `time` query is given, the function defaults to all
        timesteps available to all sensors (e.g. 1987-2018)
    sensors : list, optional
        An optional list of Landsat sensor names to load data for.
        Options are 'ls5', 'ls7', 'ls8'; defaults to all.
    product : str, optional
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults
        to 'nbart'. For information on the difference, see the
        '02_DEA_datasets/Introduction_to_Landsat' or
        '02_DEA_datasets/Introduction_to_Fractional_Cover' notebooks
        from DEA-notebooks.
    bands_of_interest : list, optional
        An optional list of strings containing the bands to be read in;
        options include 'red', 'green', 'blue', 'nir', 'swir1',
        'swir2'; defaults to all available bands if no bands are
        specified.
    masked_prop : float, optional
        An optional float giving the minimum percentage of clear pixels
        required for a Landsat observation to be loaded. Defaults to
        0.99 (i.e. only return observations with less than 1% of poor
        quality pixels).
    mask_dict : dict, optional
        An optional dict of arguments to the `masking.make_mask`
        function that can be used to identify good/poor quality pixels
        from the PQ layer using alternative masking criteria. The
        default value of None masks out pixels flagged as cloud by
        either the ACCA or Fmask algorithms, or pixels that are missing
        data in any band (equivalent to:
        `mask_dict={'cloud_acca': 'no_cloud',
        'cloud_fmask': 'no_cloud', 'contiguous': True}`). See the
        `02_DEA_datasets/Introduction_to_LandsatPQ.ipynb` notebook on
        DEA Notebooks for a list of all possible options.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the pixel
        quality mask to all observations that were not filtered out for
        having less good quality pixels than `masked_prop`. For
        example, if `masked_prop=0.99`, the filtered images may still
        contain up to 1% poor quality pixels. The default of False
        simply returns the resulting observations without masking out
        these pixels; True masks them out and sets them to NaN using
        the pixel quality mask, but has the side effect of changing the
        data type of the output arrays from int16 to float64 which can
        cause memory issues. To reduce memory usage, set to False.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata
        values should be replaced with NaN. Defaults to True; this has
        the side effect of changing the data type of the output arrays
        from int16 to float64 which can cause memory issues. To reduce
        memory usage, set to False.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        False, which removes all Landsat 7 observations after May 31
        2003.
    satellite_metadata : bool, optional
        An optional boolean indicating whether to return the dataset
        with a `satellite` variable that gives the name of the
        satellite that made each observation in the timeseries (i.e.
        ls5, ls7, ls8). Defaults to False.

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only Landsat observations that
        contain greater than `masked_prop` proportion of clear pixels.

    Notes
    -----
    Memory issues: For large data extractions, it is recommended that
    you set both `mask_pixel_quality=False` and
    `mask_invalid_data=False`. Otherwise, all output variables will be
    coerced to float64 when NaN values are inserted into the array,
    potentially causing your data to use 4x as much memory. Be aware
    that the resulting arrays will contain invalid -999 values which
    should be considered in analyses.

    Example
    -------
    >>> # Import modules
    >>> import datacube
    >>> import sys

    >>> # Import dea-notebooks functions using relative link to 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling

    >>> # Connect to a datacube containing Landsat data
    >>> dc = datacube.Datacube(app='load_clearlandsat')

    >>> # Set up spatial and temporal query
    >>> query = {'x': (954163, 972163),
    ...          'y': (-3573891, -3555891),
    ...          'time': ('2011-06-01', '2013-06-01'),
    ...          'crs': 'EPSG:3577'}

    >>> # Load observations with less than 25% cloud from ls5, ls7 and ls8 as a single combined dataset
    >>> landsat_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query, sensors=['ls5', 'ls7', 'ls8'],
    ...                                                bands_of_interest=['red', 'green', 'blue'],
    ...                                                masked_prop=0.75, mask_pixel_quality=True, ls7_slc_off=True)
    Loading ls5 pixel quality
        Loading 4 filtered ls5 timesteps
    Loading ls7 pixel quality
        Loading 29 filtered ls7 timesteps
    Loading ls8 pixel quality
        Loading 3 filtered ls8 timesteps
    Combining and sorting ls5, ls7, ls8 data
        Replacing invalid -999 values with NaN (data will be coerced to float64)

    >>> # Test that function returned data
    >>> len(landsat_ds.time) > 0
    True
    """

    # List to save results from each sensor and list to keep names of
    # successfully processed sensors
    filtered_sensors = []
    successfully_returned = []

    # Iterate through all sensors, returning only observations with
    # > mask_prop clear pixels
    for sensor in sensors:

        try:
            # If bands of interest are given, assign measurements in
            # dc.load call. This is for compatibility with the existing
            # dea-notebooks load_nbarx function.
            if bands_of_interest:

                # Lazily load Landsat data using dask
                data = dc.load(product=f'{sensor}_{product}_albers',
                               measurements=bands_of_interest,
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)

            # If no bands of interest given, run without specifying
            # measurements, and therefore return all available bands
            else:

                # Lazily load Landsat data using dask
                data = dc.load(product=f'{sensor}_{product}_albers',
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)

            # Load PQ data
            pq = dc.load(product=f'{sensor}_pq_albers',
                         group_by='solar_day',
                         fuse_func=ga_pq_fuser,
                         dask_chunks={'time': 1},
                         **query)

            # Remove Landsat 7 SLC-off from PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':
                print('Ignoring SLC-off observations for ls7')
                data = data.sel(time=data.time < np.datetime64('2003-05-30'))

            # Return only Landsat observations that have matching PQ data
            time = (data.time - pq.time).time
            data = data.sel(time=time)
            pq = pq.sel(time=time)

            # Load PQ data using dask
            print('Loading {} pixel quality'.format(sensor))
            pq = pq.compute()

            # If a custom dict is provided for mask_dict, use these
            # values to make mask from PQ
            if mask_dict:

                # Mask PQ using custom values by unpacking mask_dict **kwarg
                good_quality = masking.make_mask(pq.pixelquality, **mask_dict)

            else:

                # Identify pixels with no clouds in either ACCA or Fmask
                good_quality = masking.make_mask(pq.pixelquality,
                                                 cloud_acca='no_cloud',
                                                 cloud_fmask='no_cloud',
                                                 contiguous=True)

            # Compute good data for each observation as a percentage of
            # total array pixels
            data_perc = (good_quality.sum(dim=['x', 'y']) /
                         (good_quality.shape[1] * good_quality.shape[2]))

            # Add data_perc data to Landsat dataset as a new xarray variable
            data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

            # Filter by data_perc to drop low quality observations and
            # finally import data using dask
            filtered = data.sel(time=data.data_perc >= masked_prop)
            print(f'    Loading {len(filtered.time)} filtered {sensor} timesteps')
            filtered = filtered.compute()

            # Optionally apply pixel quality mask to all observations
            # that were not dropped in previous step
            if mask_pixel_quality:
                filtered = filtered.where(good_quality)

            # Optionally add satellite name variable
            if satellite_metadata:
                filtered['satellite'] = xr.DataArray([sensor] * len(filtered.time),
                                                     [('time', filtered.time)])

            # Append result to list and add sensor name to list of
            # successfully returned sensors
            filtered_sensors.append(filtered)
            successfully_returned.append(sensor)

            # Close datasets
            filtered = None
            good_quality = None
            data = None
            pq = None

        except:
            # If there is no data for sensor or if another error occurs:
            print(f'Loading {sensor} pixel quality\n    '
                  f'Skipping {sensor}; no valid data for query')

    # Concatenate all sensors into one big xarray dataset, and then sort by time
    sensor_string = ", ".join(successfully_returned)
    print(f'Combining and sorting {sensor_string} data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')

    # Optionally filter to replace no data values with nans
    if mask_invalid_data:
        print('    Replacing invalid -999 values with NaN '
              '(data will be coerced to float64)')
        combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
def runAll(num_bands, args):
    """Run on all tiles in the specified datasets/area. Keys are based
    on the last dataset listed."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    dc = datacube.Datacube()

    # Create GridWorkflow object for most recent dataset
    gw = GridWorkflow(dc.index, product=args.input_products[-1])

    # Get list of cell keys for most recent dataset
    keys = list(
        gw.list_cells(product=args.input_products[-1],
                      lat=(args.lowerlat, args.upperlat),
                      lon=(args.lowerlon, args.upperlon)).keys())

    dc.close()

    # Run on each key/tile in turn
    for key in keys:

        ccdc_args = []
        input_ds = []
        tmask_ds = []
        cloud_ds = []

        input_ds = loadAll(args.input_products, key, args.bands)

        if input_ds:

            if args.tmask_products:
                tmask_ds = loadAll(args.tmask_products, key,
                                   ['green', 'nir', 'swir1'])

            if args.cloud_products:
                cloud_ds = loadAll(args.cloud_products, key, ['cloud_mask'])

            # Tidy up input data
            input_data = xr.concat(input_ds, dim='time')
            input_data = mask_invalid_data(input_data)

            if cloud_ds:
                cloud_masks = xr.concat(cloud_ds, dim='time')

            # Do the same for TOA data if present - tmask_ds will be
            # empty if no TOA data sets were specified
            if tmask_ds:
                tmask_data = xr.concat(tmask_ds, dim='time')
                tmask_data = mask_invalid_data(tmask_data)

            # We want to process each pixel separately
            for i in range(len(input_data.x)):
                for j in range(len(input_data.y)):

                    # Get just one pixel
                    input_ts = input_data.isel(x=i, y=j)
                    x_val = float(input_ts.x)
                    y_val = float(input_ts.y)

                    # Transform the time series into a numpy array
                    input_ts = transformToArray(input_ts)

                    if (input_ts.shape[0] > 0
                            and input_ts.shape[1] == input_num_cols):

                        if cloud_ds:
                            # Get cloud mask values through time for this pixel
                            cloud_ts = cloud_masks.isel(x=i, y=j)
                            cloud_ts = transformToArray(cloud_ts)

                            # Remove any rows which aren't in the SREF data
                            cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0],
                                                        input_ts[:, 0])]

                            # Do masking (0 value is clear)
                            input_ts = input_ts[cloud_ts[:, 1] == 0]

                        if tmask_ds:
                            tmask_ts = tmask_data.isel(x=i, y=j)
                            tmask_ts = transformToArray(tmask_ts)

                            # Remove any rows which aren't in the SREF data
                            tmask_ts = tmask_ts[np.isin(tmask_ts[:, 0],
                                                        input_ts[:, 0])]

                            # Use Tmask to further screen the input data
                            input_ts = doTmask(input_ts, tmask_ts)

                        argslist = (input_ts, num_bands, x_val, y_val, args)
                        ccdc_args.append(argslist)

            # Do some tidying up
            del input_data

            if cloud_ds:
                del cloud_ds
                del cloud_masks

            if tmask_ds:
                del tmask_ds
                del tmask_data

            # Run processes for this key
            with Pool(processes=args.num_procs) as pool:
                pool.starmap(runCCDC, ccdc_args)

            # Generate output file name for this key
            output_file = os.path.join(
                args.outdir,
                "{}_{}_{}.csv".format(args.output_file, key[0], key[1]))

            # Write headers to file
            headers = [
                "x", "y", "band", "start_date", "end_date", "start_val",
                "end_val", "coeffs", "RMSE", "intercept", "alpha",
                "change_date", "magnitude"
            ]

            with open(output_file, 'w') as output:
                writer = csv.writer(output)
                writer.writerow(headers)
                writer.writerows(rows)

            # Reset shared list
            rows = []
def runByTile(key, num_bands, args):
    """Lets you process data using cell keys and x/y extent. A key
    represents one cell/area. Each cell has a tile for each time point.
    The x and y values define the extent of the tile that should be
    loaded and processed."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    ccdc_args = []
    input_ds = []
    tmask_ds = []
    cloud_ds = []

    input_ds = loadByTile(args.input_products, key, args.tile_y_min,
                          args.tile_y_max, args.tile_x_min,
                          args.tile_x_max, args.bands)

    # Check that there is actually some input data
    if input_ds:

        # If tmask should be used to screen for outliers
        if args.tmask_products:
            tmask_ds = loadByTile(args.tmask_products, key,
                                  args.tile_y_min, args.tile_y_max,
                                  args.tile_x_min, args.tile_x_max,
                                  ['green', 'nir', 'swir1'])

        if args.cloud_products:
            cloud_ds = loadByTile(args.cloud_products, key,
                                  args.tile_y_min, args.tile_y_max,
                                  args.tile_x_min, args.tile_x_max,
                                  ['cloud_mask'])

        # Tidy up input data
        input_data = xr.concat(input_ds, dim='time')
        input_data = mask_invalid_data(input_data)
        del input_ds

        if cloud_ds:
            cloud_masks = xr.concat(cloud_ds, dim='time')

        # Do the same for TOA data if present - tmask_ds will be empty
        # if no TOA data sets were specified
        if tmask_ds:
            tmask_data = xr.concat(tmask_ds, dim='time')
            tmask_data = mask_invalid_data(tmask_data)

        for i in range(len(input_data.x)):
            for j in range(len(input_data.y)):

                input_ts = input_data.isel(x=i, y=j)
                x_val = float(input_ts.x)
                y_val = float(input_ts.y)

                # Transform to Numpy array, sort and remove NaNs
                input_ts = transformToArray(input_ts)

                # Check that the input data has at least 1 row and the
                # right number of columns
                if (input_ts.shape[0] > 0
                        and input_ts.shape[1] == input_num_cols):

                    if cloud_ds:
                        # Get cloud mask values through time for this pixel
                        cloud_ts = cloud_masks.isel(x=i, y=j)
                        cloud_ts = transformToArray(cloud_ts)

                        # Remove any rows which aren't in the SREF data
                        cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0],
                                                    input_ts[:, 0])]

                        # Do masking (0 value is clear)
                        input_ts = input_ts[cloud_ts[:, 1] == 0]

                    if tmask_ds:
                        tmask_ts = tmask_data.isel(x=i, y=j)
                        tmask_ts = transformToArray(tmask_ts)

                        # Remove any rows which aren't in the SREF data
                        tmask_ts = tmask_ts[np.isin(tmask_ts[:, 0],
                                                    input_ts[:, 0])]

                        # Use Tmask to further screen the input data
                        input_ts = doTmask(input_ts, tmask_ts)

                    argslist = (input_ts, num_bands, x_val, y_val, args)
                    ccdc_args.append(argslist)

        # Do some tidying up
        del input_data

        if cloud_ds:
            del cloud_ds
            del cloud_masks

        if tmask_ds:
            del tmask_ds
            del tmask_data

        # Run processes
        with Pool(processes=args.num_procs) as pool:
            pool.starmap(runCCDC, ccdc_args)

        # Generate output file name
        output_file = os.path.join(args.outdir,
                                   "{}.csv".format(args.output_file))

        # Write headers to file
        headers = [
            "x", "y", "band", "start_date", "end_date", "start_val",
            "end_val", "coeffs", "RMSE", "intercept", "alpha",
            "change_date", "magnitude"
        ]

        with open(output_file, 'w') as output:
            writer = csv.writer(output)
            writer.writerow(headers)
            writer.writerows(rows)
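# A guess at the transformToArray helper used by runAll/runByTile above;
# the name comes from those functions, but this body is an assumption
# based only on their comments ("Transform to Numpy array, sort and
# remove NaNs") and on the expected shape (time in column 0, one column
# per band, i.e. num_bands + 1 columns in total).
def transform_to_array_sketch(pixel_ts):
    import numpy as np

    # Encode timestamps as float days since the epoch for column 0
    times = pixel_ts.time.values.astype('datetime64[D]').astype(float)

    # Stack band variables into a (band, time) array, then transpose
    values = pixel_ts.to_array().values

    # Combine into (time, 1 + band), sort rows by time, drop NaN rows
    arr = np.column_stack([times, values.T])
    arr = arr[arr[:, 0].argsort()]
    return arr[~np.isnan(arr).any(axis=1)]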
def load_clearlandsat(dc, query, sensors=('ls5', 'ls7', 'ls8'),
                      product='nbart', dask_chunks={'time': 1},
                      lazy_load=False, bands_of_interest=None,
                      masked_prop=0.0, mask_dict=None,
                      mask_pixel_quality=True, mask_invalid_data=True,
                      ls7_slc_off=False, satellite_metadata=False):
    """Loads Landsat NBAR, NBART or FC25 and PQ data for multiple
    sensors (i.e. ls5, ls7, ls8) and returns a single xarray dataset
    containing only observations that contain greater than a given
    proportion of good quality pixels. This function can be used to
    extract visually appealing time series of observations that are
    not affected by cloud, for example as an input to the
    `animated_timeseries` function from `DEAPlotting`.

    The proportion of clear pixels is calculated by summing the pixels
    that are marked as being good quality in the Landsat PQ25 layer.
    By default cloud, cloud shadow, saturated pixels and pixels missing
    data for any band are considered poor quality data, but this can be
    customised using the `mask_dict` parameter.

    Last modified: March 2019
    Author: Robbi Bishop-Taylor, Bex Dunn

    Parameters
    ----------
    dc : datacube Datacube object
        A specific Datacube to import from, i.e.
        `dc = datacube.Datacube(app='Clear Landsat')`. This allows you
        to also use development datacubes if they have been imported
        into the environment.
    query : dict
        A dict containing the query bounds. Can include lat/lon, time
        etc. If no `time` query is given, the function defaults to all
        timesteps available to all sensors (e.g. 1987-2018)
    sensors : list, optional
        An optional list of Landsat sensor names to load data for.
        Options are 'ls5', 'ls7', 'ls8'; defaults to all.
    product : str, optional
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults
        to 'nbart'. For information on the difference, see the
        '02_DEA_datasets/Introduction_to_Landsat' or
        '02_DEA_datasets/Introduction_to_Fractional_Cover' notebooks
        from DEA-notebooks.
    dask_chunks : dict, optional
        An optional dictionary containing the coords and sizes you wish
        to create dask chunks over. Usually used in combination with
        `lazy_load=True` (see below). Example:
        `dask_chunks = {'x': 500, 'y': 500}`
    lazy_load : boolean, optional
        Setting this variable to True will delay the computation of the
        function until you explicitly run `ds.compute()`. If used in
        conjunction with `dask.distributed.Client()` this will allow
        for automatic parallel computation.
    bands_of_interest : list, optional
        An optional list of strings containing the bands to be read in;
        options include 'red', 'green', 'blue', 'nir', 'swir1',
        'swir2'; defaults to all available bands if no bands are
        specified.
    masked_prop : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a Landsat observation to be loaded.
        Defaults to 0.0 which will return all observations regardless
        of pixel quality (set to e.g. 0.99 to return only observations
        with more than 99% good quality pixels).
    mask_dict : dict, optional
        An optional dict of arguments to the `masking.make_mask`
        function that can be used to identify poor quality pixels from
        the PQ layer using alternative masking criteria. The default
        value of None masks out pixels flagged as cloud or cloud shadow
        by either the ACCA or Fmask algorithms, any saturated pixels,
        or any pixels that are missing data in any band (equivalent to:
        `mask_dict={'cloud_acca': 'no_cloud',
        'cloud_shadow_acca': 'no_cloud_shadow',
        'cloud_shadow_fmask': 'no_cloud_shadow',
        'cloud_fmask': 'no_cloud', 'blue_saturated': False,
        'green_saturated': False, 'red_saturated': False,
        'nir_saturated': False, 'swir1_saturated': False,
        'swir2_saturated': False, 'contiguous': True}`). See the
        `02_DEA_datasets/Introduction_to_LandsatPQ.ipynb` notebook on
        DEA Notebooks for a list of all possible options.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the pixel
        quality mask to all observations that were not filtered out for
        having less good quality pixels than `masked_prop`. For
        example, if `masked_prop=0.99`, the filtered images may still
        contain up to 1% poor quality pixels. Setting this to False
        simply returns the resulting observations without masking out
        these pixels; the default of True masks them out and sets them
        to NaN using the pixel quality mask, but has the side effect of
        changing the data type of the output arrays from int16 to
        float32 which can cause memory issues. To reduce memory usage,
        set to False.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata
        values should be replaced with NaN. Defaults to True; this has
        the side effect of changing the data type of the output arrays
        from int16 to float32 which can cause memory issues. To reduce
        memory usage, set to False.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        False, which removes all Landsat 7 observations after May 31
        2003.
    satellite_metadata : bool, optional
        An optional boolean indicating whether to return the dataset
        with a `satellite` variable that gives the name of the
        satellite that made each observation in the timeseries (i.e.
        ls5, ls7, ls8). Defaults to False.

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only Landsat observations that
        contain greater than `masked_prop` proportion of clear pixels.

    Notes
    -----
    Memory issues: For large data extractions, it is recommended that
    you set both `mask_pixel_quality=False` and
    `mask_invalid_data=False`. Otherwise, all output variables will be
    coerced to float32 when NaN values are inserted into the array,
    potentially causing your data to use 2x as much memory. Be aware
    that the resulting arrays will contain invalid -999 values which
    should be considered in analyses.

    Example
    -------
    >>> # Import modules
    >>> import datacube
    >>> import sys

    >>> # Import dea-notebooks functions using relative link to 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling

    >>> # Connect to a datacube containing Landsat data
    >>> dc = datacube.Datacube(app='load_clearlandsat')

    >>> # Set up spatial and temporal query
    >>> query = {'x': (954163, 972163),
    ...          'y': (-3573891, -3555891),
    ...          'time': ('2011-06-01', '2013-06-01'),
    ...          'crs': 'EPSG:3577'}

    >>> # Load observations with more than 75% good quality pixels from ls5, ls7 and ls8 as a combined dataset
    >>> landsat_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query, sensors=['ls5', 'ls7', 'ls8'],
    ...                                                bands_of_interest=['red', 'green', 'blue'],
    ...                                                masked_prop=0.75, mask_pixel_quality=True, ls7_slc_off=True)
    Loading ls5
        Loading 4 filtered ls5 timesteps
    Loading ls7
        Loading 29 filtered ls7 timesteps
    Loading ls8
        Loading 3 filtered ls8 timesteps
    Combining and sorting ls5, ls7, ls8 data
        Replacing invalid -999 values with NaN (data will be coerced to float32)

    >>> # Test that function returned data
    >>> len(landsat_ds.time) > 0
    True
    """

    #######################
    # Process each sensor #
    #######################

    # Warn if loading a pq bitstring product and attempting to mask it
    # (and therefore cast to float)
    if product == 'pq' and (mask_invalid_data or mask_pixel_quality):
        warnings.warn("""You are attempting to load pixel quality product with a mask flag
                      (mask_invalid_data or mask_pixel_quality).
                      Pixel quality is a bitstring (only makes sense as int)
                      and masking casts to float32.""")

    # Dictionary to save results from each sensor
    filtered_sensors = {}

    # Iterate through all sensors, returning only observations with
    # > mask_prop clear pixels
    for sensor in sensors:

        # Load PQ data using dask
        print(f'Loading {sensor}')

        # If bands of interest are given, assign measurements in
        # dc.load call. This is for compatibility with the existing
        # dea-notebooks load_nbarx function.
        if bands_of_interest:

            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           measurements=bands_of_interest,
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)

        # If no bands of interest given, run without specifying
        # measurements, and therefore return all available bands
        else:

            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)

        # Load PQ data
        pq = dc.load(product=f'{sensor}_pq_albers',
                     group_by='solar_day',
                     fuse_func=ga_pq_fuser,
                     dask_chunks=dask_chunks,
                     **query)

        # If resulting dataset has data, continue:
        if data.variables:

            # Remove Landsat 7 SLC-off from PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':
                print('    Ignoring SLC-off observations for ls7')
                data = data.sel(time=data.time < np.datetime64('2003-05-30'))

            # If more than 0 timesteps
            if len(data.time) > 0:

                # Return only Landsat observations that have matching PQ data
                time = (data.time - pq.time).time
                data = data.sel(time=time)
                pq = pq.sel(time=time)

                # If a custom dict is provided for mask_dict, use these
                # values to make mask from PQ
                if mask_dict:

                    # Mask PQ using custom values by unpacking mask_dict **kwarg
                    good_quality = masking.make_mask(pq.pixelquality,
                                                     **mask_dict)

                else:

                    # Identify pixels with no clouds in either ACCA or Fmask
                    good_quality = masking.make_mask(
                        pq.pixelquality,
                        cloud_acca='no_cloud',
                        cloud_shadow_acca='no_cloud_shadow',
                        cloud_shadow_fmask='no_cloud_shadow',
                        cloud_fmask='no_cloud',
                        blue_saturated=False,
                        green_saturated=False,
                        red_saturated=False,
                        nir_saturated=False,
                        swir1_saturated=False,
                        swir2_saturated=False,
                        contiguous=True)

                # Compute good data for each observation as a
                # percentage of total array pixels. Need to sum over x
                # and y axes individually so that the function works
                # with lat-lon dimensions, and because it isn't
                # currently possible to pass a list of axes (bug with
                # xarray?)
                data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                             (good_quality.shape[1] * good_quality.shape[2]))

                # Add data_perc data to Landsat dataset as a new xarray variable
                data['data_perc'] = xr.DataArray(data_perc,
                                                 [('time', data.time)])

                # Filter by data_perc to drop low quality observations
                # and finally import data using dask
                filtered = data.sel(time=data.data_perc >= masked_prop)
                print(f'    Loading {len(filtered.time)} filtered {sensor} timesteps')

                # Optionally apply pixel quality mask to all
                # observations that were not dropped in previous step
                if mask_pixel_quality:

                    # First change dtype to float32, then mask out
                    # values using `.where()`. By casting to float32,
                    # we prevent `.where()` from automatically casting
                    # to float64, using 2x the memory. We also need to
                    # manually reset attributes due to a possible bug
                    # in recent xarray version
                    filtered = (filtered.astype(np.float32)
                                .assign_attrs(crs=filtered.crs))
                    filtered = filtered.where(good_quality)

                # Optionally add satellite name variable
                if satellite_metadata:
                    filtered['satellite'] = xr.DataArray(
                        [sensor] * len(filtered.time),
                        [('time', filtered.time)])

                # Add result to dictionary
                if lazy_load:
                    filtered_sensors[sensor] = filtered
                else:
                    filtered_sensors[sensor] = filtered.compute()

                # Close datasets
                filtered = None
                good_quality = None
                data = None
                pq = None

            else:
                # If there is no data for sensor or if another error occurs:
                print(f'    Skipping {sensor}; no valid data for query')

        else:
            # If there is no data for sensor or if another error occurs:
            print(f'    Skipping {sensor}; no valid data for query')

    ############################
    # Combine multiple sensors #
    ############################

    # Proceed with concatenating only if there is more than 1 sensor processed
    if len(filtered_sensors) > 1:

        # Concatenate all sensors into one big xarray dataset, and then
        # sort by time
        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Combining and sorting {sensor_string} data')
        combined_ds = xr.concat(filtered_sensors.values(), dim='time')
        combined_ds = combined_ds.sortby('time')

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:
            print('    Replacing invalid -999 values with NaN '
                  '(data will be coerced to float32)')

            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the
            # memory. We also need to manually reset attributes due to
            # a possible bug in recent xarray version
            combined_ds = (combined_ds.astype(np.float32)
                           .assign_attrs(crs=combined_ds.crs))
            combined_ds = masking.mask_invalid_data(combined_ds)

        # Reset pixel quality attributes
        if product == 'pq':
            combined_ds.pixelquality.attrs.update(
                list(filtered_sensors.values())[0].pixelquality.attrs)

        # Return combined dataset
        return combined_ds

    # Return the single dataset if only one sensor was processed
    elif len(filtered_sensors) == 1:

        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Returning {sensor_string} data')
        sensor_ds = list(filtered_sensors.values())[0]

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:
            print('    Replacing invalid -999 values with NaN '
                  '(data will be coerced to float32)')

            # First change dtype to float32, then mask out values using
            # `.where()`, mirroring the multi-sensor branch above
            sensor_ds = (sensor_ds.astype(np.float32)
                         .assign_attrs(crs=sensor_ds.crs))
            sensor_ds = masking.mask_invalid_data(sensor_ds)

        # Return single-sensor dataset
        return sensor_ds
By casting to float32, we prevent `.where()` # from automatically casting to float64, using 2x the memory # We also need to manually reset attributes due to a possible # bug in recent xarray version sensor_ds = (sensor_ds.astype( np.float32).assign_attrs(crs=sensor_ds.crs)) sensor_ds = masking.mask_invalid_data(sensor_ds) return sensor_ds else: print( f'No data returned for query for any sensor in {", ".join(sensors)} ' f'and time range {"-".join(query["time"])}')
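# A minimal, self-contained sketch of the float32-before-.where() pattern used
# in load_clearlandsat above. The array and mask here are synthetic stand-ins,
# not DEA products.
import numpy as np
import xarray as xr

da = xr.DataArray(np.array([[100, -999], [200, 300]], dtype=np.int16),
                  dims=('y', 'x'))
good_quality = da != -999

# Without the cast, .where() promotes int16 straight to float64...
assert da.where(good_quality).dtype == np.float64

# ...while casting to float32 first halves the memory of the masked result
assert da.astype(np.float32).where(good_quality).dtype == np.float32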
def load_nbarx(dc, sensor, query, product='nbart', bands_of_interest='', filter_pq=True):
    """
    Loads NBAR (Nadir BRDF Adjusted Reflectance) or NBAR-T (terrain corrected
    NBAR) data for a sensor, masks it using pixel quality (PQ), then
    optionally filters out terrain -999s (for NBAR-T). Returns an xarray
    dataset plus CRS and Affine objects defining the map projection and
    geotransform.

    Last modified: May 2018
    Author: Bex Dunn
    Modified by: Claire Krause, Robbi Bishop-Taylor, Bex Dunn

    inputs
    dc - Handle for the Datacube to import from. This allows you to also use
         dev environments if they have been imported into the environment.
    sensor - Options are 'ls5', 'ls7', 'ls8'
    query - A dict containing the query bounds. Can include lat/lon, time etc.

    optional
    product - 'nbar' or 'nbart'. Defaults to nbart unless otherwise specified
    bands_of_interest - List of strings containing the bands to be read in;
                        defaults to all bands; options include 'red', 'green',
                        'blue', 'nir', 'swir1', 'swir2'
    filter_pq - boolean. Will filter clouds and saturated pixels using PQ
                unless set to False

    outputs
    ds - Extracted and optionally PQ-filtered dataset
    crs - CRS object defining the dataset's coordinate reference system
    affine - Affine object defining the dataset's affine transformation
    """

    product_name = '{}_{}_albers'.format(sensor, product)
    mask_product = '{}_{}_albers'.format(sensor, 'pq')
    print('Loading {}'.format(product_name))

    # If bands of interest are given, assign measurements in the dc.load call
    if bands_of_interest:
        ds = dc.load(product=product_name, measurements=bands_of_interest,
                     group_by='solar_day', **query)
    # If no bands of interest are given, run without specifying measurements
    else:
        ds = dc.load(product=product_name, group_by='solar_day', **query)

    # Proceed if the resulting call returns data
    if ds.variables:
        crs = ds.crs
        affine = ds.affine
        print('Loaded {}'.format(product_name))

        # If pixel quality filtering is enabled, extract PQ data to use as a mask
        if filter_pq:
            sensor_pq = dc.load(product=mask_product, fuse_func=ga_pq_fuser,
                                group_by='solar_day', **query)

            # If the PQ call returns data, use it to mask the input data
            if sensor_pq.variables:
                print('Generating mask {}'.format(mask_product))
                good_quality = masking.make_mask(sensor_pq.pixelquality,
                                                 cloud_acca='no_cloud',
                                                 cloud_shadow_acca='no_cloud_shadow',
                                                 cloud_shadow_fmask='no_cloud_shadow',
                                                 cloud_fmask='no_cloud',
                                                 blue_saturated=False,
                                                 green_saturated=False,
                                                 red_saturated=False,
                                                 nir_saturated=False,
                                                 swir1_saturated=False,
                                                 swir2_saturated=False,
                                                 contiguous=True)

                # Apply the mask to preserve only good data
                ds = ds.where(good_quality)

            ds.attrs['crs'] = crs
            ds.attrs['affine'] = affine

        # Replace nodata values with NaNs
        ds = masking.mask_invalid_data(ds)
        return ds, crs, affine

    else:
        print('Failed to load {}'.format(product_name))
        return None, None, None
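# Hedged usage sketch for load_nbarx: assumes a DEA-style datacube index with
# ls8 NBAR-T and PQ products available; the query extents below are
# illustrative values only, not tested coordinates.
import datacube

dc = datacube.Datacube(app='load_nbarx_example')
query = {'x': (153.30, 153.40),
         'y': (-27.50, -27.60),
         'time': ('2017-01-01', '2017-02-01')}

ds, crs, affine = load_nbarx(dc, sensor='ls8', query=query,
                             bands_of_interest=['red', 'green', 'blue'],
                             filter_pq=True)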
for tile_index, tile in tile_list.items():
    dataset = gw.load(tile[0:1, 400:401, 0:1],
                      measurements=['red', 'nir', 'blue', 'green'])  # 200ish/400ish
    if dataset.variables:
        sref_ds.append(dataset)

# Close datacube connection to database
dc.close()

# Concatenate the three datasets
sref = xr.concat(sref_ds, dim='time')

# Change nodata values (0s) to NaN
sref = mask_invalid_data(sref)

# We want to process each pixel separately
for i in range(len(sref.x)):
    for j in range(len(sref.y)):

        # Get the time series of observations for this pixel
        sref_ts = sref.isel(x=i, y=j)

        # Transform to a pandas DataFrame
        sref_data = transformToDf(sref_ts)

        # Drop rows with NA values in any column
        sref_data.dropna(axis=0, how='any', inplace=True)

        # Check columns weren't dropped
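# transformToDf is not defined in the excerpt above. Assuming it flattens a
# one-pixel time series into a pandas DataFrame, a minimal stand-in could be:
def transformToDf(pixel_ts):
    # Hypothetical helper: one row per timestep, one column per band, with
    # the time/x/y coordinates pulled out into ordinary columns
    return pixel_ts.to_dataframe().reset_index()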
def load_ard(
    dc,
    products=None,
    min_gooddata=0.0,
    fmask_gooddata=[1, 4, 5],
    mask_pixel_quality=True,
    mask_invalid_data=True,
    mask_contiguity="nbart_contiguity",
    mask_dtype=np.float32,
    ls7_slc_off=True,
    product_metadata=False,
    **dcload_kwargs,
):
    """
    Loads Landsat Collection 3 or Sentinel 2 Definitive and Near Real Time
    data for multiple sensors (i.e. ls5t, ls7e and ls8c for Landsat; s2a and
    s2b for Sentinel 2), and returns a single masked xarray dataset containing
    only observations that contain greater than a given proportion of good
    quality pixels. This can be used to extract clean time series of
    observations that are not affected by cloud, for example as an input to
    the `animated_timeseries` function from `dea_plotting`.

    The proportion of good quality pixels is calculated by summing the pixels
    flagged as good quality in `fmask`. By default non-cloudy or shadowed
    land, snow and water pixels are treated as good quality, but this can be
    customised using the `fmask_gooddata` parameter.

    Last modified: February 2020

    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`. This
        allows you to also use development datacubes if required.
    products : list
        A list of product names to load data from. Valid options are
        ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] for Landsat,
        ['s2a_ard_granule', 's2b_ard_granule'] for Sentinel 2 Definitive, and
        ['s2a_nrt_granule', 's2b_nrt_granule'] for Sentinel 2 Near Real Time
        (on the DEA Sandbox only).
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a satellite observation to be loaded. Defaults to
        0.0, which will return all observations regardless of pixel quality
        (set to e.g. 0.99 to return only observations with more than 99% good
        quality pixels).
    fmask_gooddata : list, optional
        An optional list of fmask values to treat as good quality
        observations in the above `min_gooddata` calculation. The default is
        `[1, 4, 5]` which will return non-cloudy or shadowed land, snow and
        water pixels. Choose from: `{'0': 'nodata', '1': 'valid',
        '2': 'cloud', '3': 'shadow', '4': 'snow', '5': 'water'}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data mask to
        all observations that were not filtered out for having fewer good
        quality pixels than `min_gooddata`. E.g. if `min_gooddata=0.99`, the
        filtered observations may still contain up to 1% poor quality pixels.
        If False, the resulting observations are returned without masking out
        these pixels; if True (the default), they are masked and set to NaN
        using the good data mask. This will convert numeric values to
        floating point values, which can cause memory issues; set to False to
        prevent this.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata values
        should be replaced with NaN. These invalid values can be caused by
        missing data along the edges of scenes, or terrain effects (for
        NBART). Be aware that masking out invalid values will convert all
        numeric values to floating point values when -999 values are replaced
        with NaN, which can cause memory issues.
    mask_contiguity : str or bool, optional
        An optional string or boolean indicating whether to mask out pixels
        missing data in any band (i.e. "non-contiguous" values). Although
        most missing data issues are resolved by `mask_invalid_data`, this
        step is important for generating clean and consistent composite
        datasets. The default is `mask_contiguity='nbart_contiguity'`, which
        will set any pixels with non-contiguous values to NaN based on NBART
        data. If you are loading NBAR data instead, you should specify
        `mask_contiguity='nbar_contiguity'`. To ignore non-contiguous values
        completely, set `mask_contiguity=False`. Be aware that masking out
        non-contiguous values will convert all numeric values to floating
        point values when -999 values are replaced with NaN, which can cause
        memory issues.
    mask_dtype : numpy dtype, optional
        An optional parameter that controls the data type/dtype that layers
        are coerced to when `mask_pixel_quality=True` or
        `mask_contiguity=True`. Defaults to `np.float32`, which uses
        approximately half the memory of `np.float64`.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from after the
        Landsat 7 SLC failure (i.e. SLC-off). Defaults to True, which retains
        Landsat 7 observations acquired after May 31 2003.
    product_metadata : bool, optional
        An optional boolean indicating whether to return the dataset with a
        `product` variable that gives the name of the product that each
        observation in the time series came from (e.g. 'ga_ls5t_ard_3').
        Defaults to False.
    **dcload_kwargs :
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query used to extract data. This typically includes
        `measurements`, `x`, `y`, `time`, `resolution`, `resampling`,
        `group_by` and `crs`. Keyword arguments can either be listed directly
        in the `load_ard` call like any other parameter (e.g.
        `measurements=['nbart_red']`), or by passing in a query kwarg
        dictionary (e.g. `**query`). For a list of possible options, see the
        `dc.load` documentation:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html

    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that contain
        greater than `min_gooddata` proportion of good quality pixels.
    """

    # Due to a possible bug in xarray 0.13.0, define a temporary function
    # which converts dtypes in a way that preserves attributes
    def astype_attrs(da, dtype=np.float32):
        """
        Loop through all data variables in the dataset, record attributes,
        convert to a custom dtype, then reassign attributes. If the data
        variable cannot be converted to the custom dtype (e.g. trying to
        convert a non-numeric dtype like strings to floats), skip and return
        the variable unchanged.

        This can be combined with `.where()` to save memory. By casting to
        e.g. np.float32, we prevent `.where()` from automatically casting to
        np.float64, using 2x the memory. np.float16 could be used to save
        even more memory (although this may not be compatible with all
        downstream applications).

        This custom function is required instead of using xarray's built-in
        `.astype()`, due to a bug in xarray 0.13.0 that drops attributes:
        https://github.com/pydata/xarray/issues/3348
        """
        try:
            da_attr = da.attrs
            da = da.astype(dtype)
            da = da.assign_attrs(**da_attr)
            return da
        except ValueError:
            return da

    dcload_kwargs = deepcopy(dcload_kwargs)

    # Determine if lazy loading is required
    lazy_load = 'dask_chunks' in dcload_kwargs

    # Warn the user if they combine lazy loading with min_gooddata
    if (min_gooddata > 0.0) and lazy_load:
        warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                      "will cause dask arrays to compute when "
                      "loading pixel-quality data to calculate "
                      "'good pixel' percentage. This can "
                      "significantly slow the return of your dataset.")

    # Verify that products were provided, and that only Sentinel-2 or only
    # Landsat products are being loaded at the same time
    if not products:
        raise ValueError("Please provide a list of product names "
                         "to load data from. Valid options are: \n"
                         "['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3'] "
                         "for Landsat, ['s2a_ard_granule', "
                         "'s2b_ard_granule'] \nfor Sentinel 2 Definitive, or "
                         "['s2a_nrt_granule', 's2b_nrt_granule'] for "
                         "Sentinel 2 Near Real Time")
    elif all(['ls' in product for product in products]):
        pass
    elif all(['s2' in product for product in products]):
        pass
    else:
        raise ValueError("Loading both Sentinel-2 and Landsat data "
                         "at the same time is currently not supported")

    # Create a list to hold data for each product
    product_data = []

    # Iterate through each requested product
    for product in products:

        try:
            # Load data including the fmask band
            print(f'Loading {product} data')

            try:
                # If dask_chunks is specified, load data using the query as-is
                if lazy_load:
                    ds = dc.load(product=product, **dcload_kwargs)
                # If no dask chunks are specified, add this param so that we
                # can lazily load data before filtering by good data
                else:
                    ds = dc.load(product=product, dask_chunks={}, **dcload_kwargs)
            except KeyError as e:
                raise ValueError(f'Band {e} does not exist in this product. '
                                 f'Verify all requested `measurements` exist '
                                 f'in {products}')

            # Keep a record of the original number of observations
            total_obs = len(ds.time)

            # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
            # (restored from the documented behaviour; the excerpt left
            # `ls7_slc_off` unused)
            if not ls7_slc_off and product == 'ga_ls7e_ard_3':
                print('    Ignoring SLC-off observations for ls7')
                ds = ds.sel(time=ds.time < np.datetime64('2003-05-30'))

            # Identify all pixels not affected by cloud/shadow/invalid data.
            # NOTE: the original excerpt assigned `good_quality = ds` here,
            # which masks nothing; deriving the mask from the fmask band
            # (assumed to be loaded with the dataset) matches the documented
            # `fmask_gooddata` behaviour
            good_quality = ds.fmask.isin(fmask_gooddata)

            # If any data was returned
            if len(ds.time) > 0:

                # Optionally filter out observations with less than
                # `min_gooddata` proportion of good pixels (restored from the
                # documented behaviour; the excerpt recorded `total_obs`
                # without using it)
                if min_gooddata > 0.0:
                    data_perc = (good_quality.sum(axis=1).sum(axis=1) /
                                 (good_quality.shape[1] * good_quality.shape[2]))
                    keep = data_perc >= min_gooddata
                    ds = ds.sel(time=keep)
                    good_quality = good_quality.sel(time=keep)
                    print(f'    Filtering to {len(ds.time)} '
                          f'out of {total_obs} observations')

                # Optionally apply the pixel quality mask to observations
                # remaining after the filtering step above, to mask out all
                # remaining bad quality pixels
                if mask_pixel_quality:
                    print('    Applying pixel quality/cloud mask')
                    # Change dtype to a custom float before masking to save
                    # memory; see the `astype_attrs` docstring above
                    ds = ds.apply(astype_attrs, dtype=mask_dtype, keep_attrs=True)
                    ds = ds.where(good_quality)

                # Optionally mask out pixels missing data in any band, using
                # the contiguity band named by `mask_contiguity` (assumes
                # that band was loaded; restored from the documented behaviour)
                if mask_contiguity:
                    print(f'    Applying {mask_contiguity} contiguity mask')
                    ds = ds.apply(astype_attrs, dtype=mask_dtype, keep_attrs=True)
                    ds = ds.where(ds[mask_contiguity] == 1)

                # Optionally filter to replace nodata values with NaNs
                if mask_invalid_data:
                    print('    Applying invalid data mask')
                    # Change dtype to a custom float before masking to save
                    # memory; see the `astype_attrs` docstring above
                    ds = ds.apply(astype_attrs, dtype=mask_dtype, keep_attrs=True)
                    ds = masking.mask_invalid_data(ds)

                # Optionally add a `product` variable recording the product
                # each observation came from (restored from the documented
                # behaviour)
                if product_metadata:
                    ds['product'] = xr.DataArray([product] * len(ds.time),
                                                 [('time', ds.time)])

                # If any data was returned, add the result to the list
                product_data.append(ds)

            # If no data was returned, print status
            else:
                print(f'    No data for {product}')

        # If an AttributeError is raised due to there being no variables in
        # the dataset, skip this product and move on to the next
        except AttributeError:
            print(f'    No data for {product}')

    # If any data was returned above, combine it into one xarray Dataset
    if len(product_data) > 0:

        # Concatenate the results and sort by time
        print('Combining and sorting data')
        combined_ds = xr.concat(product_data, dim='time').sortby('time')

        # If `lazy_load` is True, return the data as a dask array without
        # actually loading it in
        if lazy_load:
            print(f'    Returning {len(combined_ds.time)} observations'
                  ' as a dask array')
            return combined_ds
        else:
            print(f'    Returning {len(combined_ds.time)} observations')
            return combined_ds.compute()

    # If no data was returned:
    else:
        print('No data returned for query')
        return None
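# Hedged usage sketch for load_ard on a DEA-style index; the product and
# measurement names follow the docstring above, but availability depends on
# the datacube you connect to, and the extents are illustrative only.
import datacube

dc = datacube.Datacube(app='load_ard_example')
ds = load_ard(dc=dc,
              products=['ga_ls8c_ard_3'],
              measurements=['nbart_red', 'nbart_green', 'nbart_blue', 'fmask'],
              x=(153.30, 153.40),
              y=(-27.50, -27.60),
              time=('2019-01-01', '2019-03-01'),
              min_gooddata=0.90,
              mask_contiguity=False,  # nbart_contiguity band not loaded here
              output_crs='EPSG:3577',
              resolution=(-30, 30))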
def load_clearlandsat(dc, query, sensors=['ls5', 'ls7', 'ls8'], bands_of_interest=None,
                      product='nbart', masked_prop=0.99, mask_dict=None,
                      apply_mask=False, ls7_slc_off=False):
    """
    Loads Landsat NBAR, NBART or FC25 and PQ data for multiple sensors (i.e.
    ls5, ls7, ls8), and returns a single xarray dataset containing only
    observations that contain greater than a given proportion of clear pixels.

    This function was designed to extract visually appealing time series of
    observations that are not affected by cloud, for example as an input to
    the `animated_timeseries` function from `DEAPlotting`.

    The proportion of clear pixels is calculated by summing the pixels that
    are flagged as being problematic in the Landsat PQ25 layer. By default
    only cloudy pixels or pixels without valid data in every band are
    included in the calculation, but this can be customised using the
    `mask_dict` parameter.

    Last modified: August 2018
    Author: Robbi Bishop-Taylor, Bex Dunn

    :param dc:
        A specific Datacube to import from, i.e.
        `dc = datacube.Datacube(app='Clear Landsat')`. This allows you to also
        use development datacubes if they have been imported into the
        environment.
    :param query:
        A dict containing the query bounds. Can include lat/lon, time etc. If
        no `time` query is given, the function defaults to all timesteps
        available to all sensors (e.g. 1987-2018)
    :param sensors:
        An optional list of Landsat sensor names to load data for. Options
        are 'ls5', 'ls7', 'ls8'; defaults to all.
    :param product:
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults to
        'nbart'. For information on the difference, see the
        'GettingStartedWithLandsat' or 'Introduction_to_Fractional_Cover'
        notebooks on DEA-notebooks.
    :param bands_of_interest:
        An optional list of strings containing the bands to be read in;
        options include 'red', 'green', 'blue', 'nir', 'swir1', 'swir2';
        defaults to all available bands if no bands are specified.
    :param masked_prop:
        An optional float giving the minimum percentage of clear pixels
        required for a Landsat observation to be loaded. Defaults to 0.99
        (i.e. only return observations with less than 1% of unclear pixels).
    :param mask_dict:
        An optional dict of arguments to the `masking.make_mask` function
        that can be used to identify clear observations from the PQ layer
        using alternative masking criteria. The default value of None masks
        out pixels flagged as cloud by either the ACCA or Fmask algorithms,
        and that have values for every band (equivalent to:
        `mask_dict={'cloud_acca': 'no_cloud', 'cloud_fmask': 'no_cloud',
        'contiguous': True}`). See the `Landsat5-7-8-PQ` notebook on DEA
        Notebooks for a list of all possible options.
    :param apply_mask:
        An optional boolean indicating whether resulting observations should
        have the PQ mask applied to filter out any remaining unclear cells.
        For example, if `masked_prop=0.99`, the filtered images may still
        contain up to 1% unclear/cloudy pixels. The default of False simply
        returns the resulting observations without masking out these pixels;
        True removes them using the mask.
    :param ls7_slc_off:
        An optional boolean indicating whether to include data from after the
        Landsat 7 SLC failure (i.e. SLC-off). Defaults to False, which
        removes all Landsat 7 observations after May 31 2003.

    :returns:
        An xarray dataset containing only Landsat observations that contain
        greater than `masked_prop` proportion of clear pixels.

    :example:

    >>> # Import modules
    >>> import datacube
    >>> import sys
    >>>
    >>> # Import dea-notebooks functions using relative link to Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling
    >>>
    >>> # Define datacube to import from
    >>> dc = datacube.Datacube(app='Clear Landsat')
    >>>
    >>> # Set up spatial and temporal query
    >>> query = {'x': (-191400.0, -183400.0),
    >>>          'y': (-1423460.0, -1415460.0),
    >>>          'time': ('1998-01-01', '2003-01-01'),
    >>>          'crs': 'EPSG:3577'}
    >>>
    >>> # Load in red, green and blue bands for all clear Landsat
    >>> # observations with < 1% unclear values.
    >>> combined_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query,
    >>>                                                 bands_of_interest=['red', 'green', 'blue'],
    >>>                                                 masked_prop=0.99)
    >>> combined_ds
    """

    # List to save results from each sensor
    filtered_sensors = []

    # Iterate through all sensors, returning only observations with
    # > masked_prop clear pixels
    for sensor in sensors:

        try:

            # If bands of interest are given, assign measurements in the
            # dc.load call. This is for compatibility with the existing
            # dea-notebooks load_nbarx function.
            if bands_of_interest:
                # Lazily load Landsat data using dask
                data = dc.load(product='{}_{}_albers'.format(sensor, product),
                               measurements=bands_of_interest,
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)
            # If no bands of interest are given, run without specifying
            # measurements and therefore return all available bands
            else:
                # Lazily load Landsat data using dask
                data = dc.load(product='{}_{}_albers'.format(sensor, product),
                               group_by='solar_day',
                               dask_chunks={'time': 1},
                               **query)

            # Load PQ data
            pq = dc.load(product='{}_pq_albers'.format(sensor),
                         group_by='solar_day',
                         fuse_func=ga_pq_fuser,
                         dask_chunks={'time': 1},
                         **query)

            # Remove Landsat 7 SLC-off from the PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':
                print('Ignoring SLC-off observations for ls7')
                data = data.where(data.time < np.datetime64('2003-05-30'), drop=True)

            # Return only Landsat observations that have matching PQ data
            time = (data.time - pq.time).time
            data = data.sel(time=time)
            pq = pq.sel(time=time)

            # Load PQ data using dask
            print('Loading {} PQ'.format(sensor))
            pq = pq.compute()

            # If a custom dict is provided for mask_dict, use those values
            # to make a mask from PQ
            if mask_dict:
                # Mask PQ using custom values by unpacking mask_dict as **kwargs
                good_quality = masking.make_mask(pq.pixelquality, **mask_dict)
            else:
                # Identify pixels with no clouds in either ACCA or Fmask
                good_quality = masking.make_mask(pq.pixelquality,
                                                 cloud_acca='no_cloud',
                                                 cloud_fmask='no_cloud',
                                                 contiguous=True)

            # Compute good data for each observation as a percentage of
            # total array pixels
            data_perc = good_quality.sum(dim=['x', 'y']) / (good_quality.shape[1] *
                                                            good_quality.shape[2])

            # Add data_perc to the Landsat dataset as a new xarray variable
            data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

            # Filter and finally load the data using dask
            filtered = data.where(data.data_perc >= masked_prop, drop=True)
            print('    Loading {} filtered {} timesteps'.format(len(filtered.time), sensor))
            filtered = filtered.compute()

            # Optionally apply the mask (instead of only filtering)
            if apply_mask:
                filtered = filtered.where(good_quality)

            # Append result to list
            filtered_sensors.append(filtered)

            # Close datasets
            filtered = None
            good_quality = None
            data = None
            pq = None

        except Exception:

            # If there is no data for the sensor or if another error occurs:
            print('    Skipping {}'.format(sensor))

    # Concatenate all sensors into one big xarray dataset, then sort by time
    print('Combining and sorting ls5, ls7 and ls8 data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')

    # Filter to replace nodata values with NaNs
    combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
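# As described in the mask_dict documentation above, alternative PQ criteria
# can be passed straight through to masking.make_mask. The criteria below are
# illustrative only, not a recommended configuration.
custom_mask = {'cloud_fmask': 'no_cloud',
               'cloud_shadow_fmask': 'no_cloud_shadow',
               'contiguous': True}

# combined_ds = load_clearlandsat(dc, query, mask_dict=custom_mask,
#                                 masked_prop=0.90, apply_mask=True)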
def runOnSubset(num_bands, args):
    """If the user chooses to run the algorithm on a random subsample of the
    data, this function is called. It divides the number of subsamples to be
    taken by the number of cells/keys for a product or products. It then runs
    CCDC on the appropriate number of pixels per cell to get an even spread
    of samples."""

    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    tile_size = 3500     # Size of real tiles - too big to fit in memory
    new_tile_size = 875  # New size - this will divide each tile into 16

    dc = datacube.Datacube()

    # Create a GridWorkflow object for the most recent dataset
    gw = GridWorkflow(dc.index, product=args.input_products[-1])

    # Get the list of cell keys for the most recent dataset
    keys = list(gw.list_cells(product=args.input_products[-1]).keys())
    dc.close()

    num_keys = len(keys)

    # Calculate the number of pixels to use from each cell
    num_subs = ((tile_size / new_tile_size) *
                (tile_size / new_tile_size)) * num_keys  # Number of sub-tiles
    samples_per_cell = np.ceil(args.num_samples / num_subs).astype(
        int)  # Number of samples to be taken per sub-tile

    # Load data for each cell
    for key in keys:

        # Each tile needs to be divided into mini-tiles
        for x in range(0, tile_size, new_tile_size):      # Division in x dimension
            for y in range(0, tile_size, new_tile_size):  # Division in y dimension

                min_x = x
                max_x = x + new_tile_size
                min_y = y
                max_y = y + new_tile_size

                ccdc_args = []
                input_ds = []
                tmask_ds = []
                cloud_ds = []

                input_ds = loadByTile(args.input_products, key, min_y, max_y,
                                      min_x, max_x, args.bands)

                if input_ds:

                    if args.tmask_products:
                        tmask_ds = loadByTile(args.tmask_products, key, min_y,
                                              max_y, min_x, max_x,
                                              ['green', 'nir', 'swir1'])

                    if args.cloud_products:
                        cloud_ds = loadByTile(args.cloud_products, key, min_y,
                                              max_y, min_x, max_x,
                                              ['cloud_mask'])

                    # Tidy up input data
                    input_data = xr.concat(input_ds, dim='time')
                    input_data = mask_invalid_data(input_data)

                    if cloud_ds:
                        cloud_masks = xr.concat(cloud_ds, dim='time')

                    # Do the same for TOA data if present - tmask_ds will be
                    # empty if no TOA datasets were specified
                    if tmask_ds:
                        tmask_data = xr.concat(tmask_ds, dim='time')
                        tmask_data = mask_invalid_data(tmask_data)

                    # We want to process a random subset of pixels
                    for i in range(samples_per_cell):

                        random_x = np.random.randint(0, new_tile_size)
                        random_y = np.random.randint(0, new_tile_size)

                        # Get just one pixel
                        input_ts = input_data.isel(x=random_x, y=random_y)
                        x_val = float(input_ts.x)
                        y_val = float(input_ts.y)

                        # Transform the time series into a numpy array
                        input_ts = transformToArray(input_ts)

                        if (input_ts.shape[0] > 0 and
                                input_ts.shape[1] == input_num_cols):

                            if cloud_ds:
                                # Get cloud mask values through time for this pixel
                                cloud_ts = cloud_masks.isel(x=random_x, y=random_y)
                                cloud_ts = transformToArray(cloud_ts)

                                # Remove any rows which aren't in the SREF data
                                cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0],
                                                            input_ts[:, 0])]

                                # Do masking (0 value is clear)
                                input_ts = input_ts[cloud_ts[:, 1] == 0]

                            if tmask_ds:
                                tmask_ts = tmask_data.isel(x=random_x, y=random_y)
                                tmask_ts = transformToArray(tmask_ts)

                                # Remove any rows which aren't in the SREF data
                                tmask_ts = tmask_ts[np.isin(tmask_ts[:, 0],
                                                            input_ts[:, 0])]

                                # Use Tmask to further screen the input data
                                input_ts = doTmask(input_ts, tmask_ts)

                            argslist = (input_ts, num_bands, x_val, y_val, args)
                            ccdc_args.append(argslist)

                    # Use multiprocessing to process all samples from this mini-tile
                    # Do some tidying up
                    del input_data
                    if cloud_ds:
                        del cloud_ds
                        del cloud_masks
                    if tmask_ds:
                        del tmask_ds
                        del tmask_data

                    # Run processes
                    with Pool(processes=args.num_procs) as pool:
                        pool.starmap(runCCDC, ccdc_args)

                    # Generate output file name
                    output_file = os.path.join(
                        args.outdir,
                        "{}_{}_{}_{}_{}_{}.csv".format(args.output_file, key,
                                                       min_y, max_y, min_x, max_x))

                    # Write headers and results to file
                    headers = ["x", "y", "band", "start_date", "end_date",
                               "start_val", "end_val", "coeffs", "RMSE",
                               "intercept", "alpha", "change_date", "magnitude"]

                    with open(output_file, 'w') as output:
                        writer = csv.writer(output)
                        writer.writerow(headers)
                        writer.writerows(rows)

                    # Reset shared list
                    rows = []
def load_clearsentinel(dc, query, sensors=['s2a', 's2b'],
                       bands_of_interest=['red', 'green', 'blue'],
                       product='ard', masked_prop=0.99, mask_values=[0, 2, 3],
                       apply_mask=False, pixel_quality_band='pixel_quality'):
    """
    Loads Sentinel 2 data for multiple sensors (i.e. s2a, s2b), and returns a
    single xarray dataset containing only observations that contain greater
    than a given proportion of clear pixels.

    This can be used to extract visually appealing time series of
    observations that are not affected by cloud, for example as an input to
    the `animated_timeseries` function from `DEAPlotting`.

    The proportion of clear pixels is calculated by summing the pixels that
    are flagged as being problematic in the Sentinel pixel quality array. By
    default pixels flagged as nodata, cloud or shadow are used to calculate
    the number of unclear pixels, but this can be customised using the
    `mask_values` parameter.

    Last modified: August 2018
    Author: Robbi Bishop-Taylor

    :param dc:
        A specific Datacube to import from, i.e.
        `dc = datacube.Datacube(app='Sentinel datacube')`. This allows you to
        also use development datacubes if they have been imported into the
        environment.
    :param query:
        A dict containing the query bounds. Can include lat/lon, time etc. If
        no `time` query is given, the function defaults to all timesteps
        available to all sensors (e.g. 2015 onward)
    :param sensors:
        An optional list of Sentinel 2 sensors to load data for. Options are
        's2a' and 's2b'; defaults to both.
    :param product:
        An optional string specifying the product to load. Defaults to 'ard',
        which is equivalent to loading e.g. `s2a_ard_granule`.
    :param bands_of_interest:
        An optional list of strings containing the bands to be read in;
        options can include 'red', 'green', 'blue', 'nir1', etc, but these
        may vary depending on the database. Defaults to
        `['red', 'green', 'blue']`.
    :param masked_prop:
        An optional float giving the minimum percentage of clear pixels
        required for a Sentinel 2 observation to be loaded. Defaults to 0.99
        (i.e. only return observations with less than 1% of unclear pixels).
    :param mask_values:
        An optional list of pixel quality values to treat as invalid or
        unclear observations in the above `masked_prop` calculation. The
        default is `[0, 2, 3]` which treats nodata, cloud and cloud shadow as
        unclear observations. Choose from: `{'0': 'nodata', '1': 'valid',
        '2': 'cloud', '3': 'shadow', '4': 'snow', '5': 'water'}`.
    :param apply_mask:
        An optional boolean indicating whether resulting observations should
        have the pixel_quality mask applied to mask out any remaining unclear
        cells. For example, if `masked_prop=0.99`, the filtered images may
        still contain up to 1% unclear/cloudy pixels. The default of False
        simply returns the resulting observations without masking out these
        pixels; True removes them using the mask.
    :param pixel_quality_band:
        An optional string giving the name of the pixel quality band
        contained in the Sentinel 2 dataset. The default value is
        'pixel_quality', however the same band may also be referred to as
        'fmask' in some databases.

    :returns:
        An xarray dataset containing only Sentinel 2 observations that
        contain greater than `masked_prop` proportion of clear pixels.

    :example:

    >>> # Import modules
    >>> import datacube
    >>> import sys
    >>>
    >>> # Import dea-notebooks functions using relative link to Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling
    >>>
    >>> # Connect to a datacube containing Sentinel data
    >>> s2dc = datacube.Datacube(config='/g/data/r78/dc_configs/sentinel2.conf')
    >>>
    >>> # Set up spatial and temporal query; note that 'output_crs' and
    >>> # 'resolution' need to be set
    >>> query = {'x': (-191400.0, -183400.0),
    >>>          'y': (-1423460.0, -1415460.0),
    >>>          'time': ('2017-01-01', '2018-01-01'),
    >>>          'crs': 'EPSG:3577',
    >>>          'output_crs': 'EPSG:3577',
    >>>          'resolution': (10, 10)}
    >>>
    >>> # Load in red, green, blue and NIR1 bands for Sentinel observations
    >>> # with < 1% unclear values. Here we use apply_mask=True to mask out
    >>> # any remaining unclear pixels with NaN.
    >>> sentinel_ds = DEADataHandling.load_clearsentinel(dc=s2dc, query=query,
    >>>                                                  bands_of_interest=['red', 'green', 'blue', 'nir1'],
    >>>                                                  masked_prop=0.99, apply_mask=True)
    """

    # List to save results from each sensor
    filtered_sensors = []

    # Iterate through all sensors, returning only observations with
    # > masked_prop clear pixels
    for sensor in sensors:

        # If bands of interest are given, assign measurements in the dc.load
        # call. This is for compatibility with the existing dea-notebooks
        # load_nbarx function.
        if bands_of_interest:
            # Lazily load Sentinel 2 data using dask
            data = dc.load(product='{}_{}_granule'.format(sensor, product),
                           measurements=bands_of_interest,
                           group_by='solar_day',
                           dask_chunks={'time': 1},
                           **query)
        # If no bands of interest are given, run without specifying
        # measurements and therefore return all available bands
        else:
            # Lazily load Sentinel 2 data using dask
            data = dc.load(product='{}_{}_granule'.format(sensor, product),
                           group_by='solar_day',
                           dask_chunks={'time': 1},
                           **query)

        # Load PQ data
        pq = dc.load(product='{}_{}_granule'.format(sensor, product),
                     measurements=[pixel_quality_band],
                     group_by='solar_day',
                     dask_chunks={'time': 1},
                     **query)

        # Load PQ data using dask
        print('Loading {} PQ'.format(sensor))
        pq = pq.compute()

        # Identify pixels with valid data
        good_quality = np.isin(pq[pixel_quality_band],
                               test_elements=mask_values, invert=True)
        good_quality = pq[pixel_quality_band].where(good_quality).notnull()

        # Compute good data for each observation as a percentage of total
        # array pixels
        data_perc = good_quality.sum(dim=['x', 'y']) / (good_quality.shape[1] *
                                                        good_quality.shape[2])

        # Add data_perc to the Sentinel dataset as a new xarray variable
        data['data_perc'] = xr.DataArray(data_perc, [('time', data.time)])

        # Filter and finally load the data using dask
        filtered = data.where(data.data_perc >= masked_prop, drop=True)
        print('    Loading {} filtered {} timesteps'.format(len(filtered.time), sensor))
        filtered = filtered.compute()

        # Optionally apply the mask (instead of only filtering)
        if apply_mask:
            filtered = filtered.where(good_quality)

        # Append result to list
        filtered_sensors.append(filtered)

        # Close datasets
        filtered = None
        good_quality = None
        data = None

    # Concatenate all sensors into one big xarray dataset, then sort by time
    print('Combining and sorting s2a and s2b data')
    combined_ds = xr.concat(filtered_sensors, dim='time')
    combined_ds = combined_ds.sortby('time')

    # Filter to replace nodata values with NaNs
    combined_ds = masking.mask_invalid_data(combined_ds)

    # Return combined dataset
    return combined_ds
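# A minimal check of the two-step good-quality logic used in
# load_clearsentinel above, on synthetic fmask-style values (0=nodata,
# 2=cloud, 3=shadow are treated as unclear).
import numpy as np
import xarray as xr

pq = xr.DataArray(np.array([[[1, 0], [2, 5]]]), dims=('time', 'y', 'x'))
mask_values = [0, 2, 3]

clear = np.isin(pq, test_elements=mask_values, invert=True)
good_quality = pq.where(clear).notnull()

print(good_quality.values)  # [[[ True False] [False  True]]]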
def runOnArea(num_bands, args):
    """If the user chooses to run the algorithm on the whole of the specified
    area, this function is called. This function will load the whole area
    specified by the user, and run the algorithm on each pixel."""

    # Refers to the Manager list object
    global rows

    # Calculate the right number of columns to be returned from the data cube
    input_num_cols = num_bands + 1

    ccdc_args = []
    input_ds = []
    tmask_ds = []
    cloud_ds = []

    input_ds = loadArea(args.input_products, args.bands, args.lowerlat,
                        args.upperlat, args.lowerlon, args.upperlon)

    if len(input_ds) == len(args.input_products):

        if args.tmask_products:
            tmask_ds = loadArea(args.tmask_products, ['green', 'nir', 'swir1'],
                                args.lowerlat, args.upperlat, args.lowerlon,
                                args.upperlon)

        if args.cloud_products:
            cloud_ds = loadArea(args.cloud_products, ['cloud_mask'],
                                args.lowerlat, args.upperlat, args.lowerlon,
                                args.upperlon)

        # Tidy up input data
        input_data = xr.concat(input_ds, dim='time')
        input_data = mask_invalid_data(input_data)

        if cloud_ds:
            cloud_masks = xr.concat(cloud_ds, dim='time')

        if tmask_ds:
            tmask_data = xr.concat(tmask_ds, dim='time')
            tmask_data = mask_invalid_data(tmask_data)

        for i in range(len(input_data.x)):
            for j in range(len(input_data.y)):

                input_ts = input_data.isel(x=i, y=j)
                x_val = float(input_ts.x)
                y_val = float(input_ts.y)
                input_ts = transformToArray(input_ts)

                if (input_ts.shape[0] > 0 and
                        input_ts.shape[1] == input_num_cols):

                    if cloud_ds:
                        # Get cloud mask values through time for this pixel
                        cloud_ts = cloud_masks.isel(x=i, y=j)
                        cloud_ts = transformToArray(cloud_ts)

                        # Remove any rows which aren't in the SREF data
                        cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0], input_ts[:, 0])]

                        # Do masking (0 value is clear)
                        input_ts = input_ts[cloud_ts[:, 1] == 0]

                    if tmask_ds:
                        tmask_ts = tmask_data.isel(x=i, y=j)
                        tmask_ts = transformToArray(tmask_ts)

                        # Remove any rows which aren't in the SREF data
                        tmask_ts = tmask_ts[np.isin(tmask_ts[:, 0], input_ts[:, 0])]

                        # Use Tmask to further screen the input data
                        input_ts = doTmask(input_ts, tmask_ts)

                    argslist = (input_ts, num_bands, x_val, y_val, args)
                    ccdc_args.append(argslist)

        # Do some tidying up
        del input_data
        if cloud_ds:
            del cloud_ds
            del cloud_masks
        if tmask_ds:
            del tmask_ds
            del tmask_data

    # Run processes
    with Pool(processes=args.num_procs) as pool:
        pool.starmap(runCCDC, ccdc_args)

    # Generate output file name
    output_file = os.path.join(args.outdir, "{}.csv".format(args.output_file))

    # Write headers and results to file
    headers = ["x", "y", "band", "start_date", "end_date", "start_val",
               "end_val", "coeffs", "RMSE", "intercept", "alpha",
               "change_date", "magnitude"]

    with open(output_file, 'w') as output:
        writer = csv.writer(output)
        writer.writerow(headers)
        writer.writerows(rows)
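# Toy demonstration of the np.isin date-alignment used in both runOnSubset
# and runOnArea: column 0 is a date key, column 1 a value (or a cloud flag,
# where 0 means clear). The arrays below are synthetic.
import numpy as np

input_ts = np.array([[1, 10], [2, 20], [4, 40]])
cloud_ts = np.array([[1, 0], [2, 1], [3, 0], [4, 0]])

# Keep only cloud rows whose date also appears in the SREF time series
cloud_ts = cloud_ts[np.isin(cloud_ts[:, 0], input_ts[:, 0])]

# Keep only clear (cloud flag == 0) SREF observations
input_ts = input_ts[cloud_ts[:, 1] == 0]
print(input_ts)  # rows for dates 1 and 4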