Example #1
def fmask_filter_c2(fmask):
    mask = np.zeros(fmask.shape, dtype=np.uint8)
    col2_nodata = masking.make_mask(fmask, nodata=True)
    col2_cloud = masking.make_mask(fmask, cloud_or_cirrus="cloud_or_cirrus")
    col2_cloud_shadow = masking.make_mask(fmask, cloud_shadow="cloud_shadow")

    mask[col2_cloud.values] += MASKED_CLOUD
    mask[col2_cloud_shadow.values] += MASKED_CLOUD_SHADOW
    mask[col2_nodata.values] = NO_DATA
    return mask
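Example #1 assumes numpy, the datacube masking helpers, and three module-level bit constants. A minimal sketch of that surrounding context (the constant values here are illustrative, not the source module's):

import numpy as np
from datacube.utils import masking  # datacube.storage.masking on older versions

# Illustrative bit values only; the source module defines its own
NO_DATA = 1
MASKED_CLOUD = 2
MASKED_CLOUD_SHADOW = 4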
Example #2
def create_mask(data, flags):
    if "or" in flags:
        fs = flags["or"]
        mask = None
        for key, value in fs.items():
            f = {key: value}
            if mask is None:
                mask = make_mask(data, **f)
            else:
                mask |= make_mask(data, **f)
    else:
        fs = flags if "and" not in flags else flags["and"]
        mask = make_mask(data, **fs)
    return mask
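A hypothetical flags argument for create_mask above: an "or" dictionary ORs together one single-flag mask per entry, while an "and" (or bare) dictionary is passed to make_mask in a single call. Flag names depend on the product's flags_definition:

# Hypothetical flag dictionaries
flags_or = {"or": {"cloud_acca": "no_cloud", "cloud_fmask": "no_cloud"}}
flags_and = {"and": {"cloud_fmask": "no_cloud", "contiguous": True}}
# mask = create_mask(pq_band, flags_or)   -> True where either flag matches
# mask = create_mask(pq_band, flags_and)  -> True only where both flags pass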
Example #3
    def to_mask(self, data, pq_data, extra_mask=None):
        date_count = len(data.coords["time"])
        if date_count > 1:
            mdh = self.get_multi_date_handler(date_count)
            if extra_mask is not None:
                extra_mask = mdh.collapse_mask(extra_mask)
            if pq_data is not None:
                pq_data = mdh.collapse_mask(pq_data)
        else:
            if extra_mask is not None:
                extra_mask = extra_mask.squeeze(dim="time", drop=True)
            if pq_data is not None:
                pq_data = pq_data.squeeze(dim="time", drop=True)

        result = extra_mask
        if pq_data is not None:
            for mask in self.masks:
                odc_mask = make_mask(pq_data, **mask.flags)
                mask_data = getattr(odc_mask, self.product.pq_band)
                if mask.invert:
                    mask_data = ~mask_data
                if result is None:
                    result = mask_data
                else:
                    result = result & mask_data
        return result
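to_mask reduces all configured masks with a logical AND, so a pixel survives only if every (optionally inverted) mask passes. The same reduction in isolation:

import numpy as np

# Two hypothetical single-band boolean masks
masks = [np.array([True, True, False]),
         np.array([True, False, False])]

result = None
for mask_data in masks:
    result = mask_data if result is None else (result & mask_data)
# result -> array([ True, False, False])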
Example #4
def single_date_make_mask(data, mask):
    pq_data = getattr(data, mask.band_name)
    if mask.flags:
        odc_mask = make_mask(pq_data, **mask.flags)
    else:
        odc_mask = pq_data == mask.enum
    odc_mask = odc_mask.squeeze(dim="time", drop=True)
    return odc_mask
Example #5
def create_long_arrays(ldc, udc, lquery, lquery2):

    usgs_names = [
        'coastal_aerosol', 'blue', 'green', 'red', 'nir', 'swir1', 'swir2'
    ]

    ls8_temp = ldc.load(product='ls8_nbart_scene', **lquery)
    ls8_bigtemp = ldc.load(product='ls8_nbart_scene', **lquery2)

    ls8_usgs_temp = udc.load(product='ls8_usgs_l2c1',
                             measurements=usgs_names,
                             **lquery)
    ls8_usgs_bigtemp = udc.load(product='ls8_usgs_l2c1',
                                measurements=usgs_names,
                                **lquery2)

    ls8_pq = ldc.load(product='ls8_pq_scene', fuse_func=ga_pq_fuser, **lquery2)

    good_quality = masking.make_mask(ls8_pq.pqa,
                                     cloud_acca='no_cloud',
                                     cloud_fmask='no_cloud',
                                     cloud_shadow_acca='no_cloud_shadow',
                                     cloud_shadow_fmask='no_cloud_shadow',
                                     blue_saturated=False,
                                     green_saturated=False,
                                     red_saturated=False,
                                     nir_saturated=False,
                                     swir1_saturated=False,
                                     swir2_saturated=False,
                                     contiguous=True)
    ls8_array = ls8_temp.where(good_quality)
    ls8_bigarray = ls8_bigtemp.where(good_quality)

    ls8_usgs_array = ls8_usgs_temp.where(good_quality)
    ls8_usgs_bigarray = ls8_usgs_bigtemp.where(good_quality)

    ls8_array = ls8_array.rename({
        '1': 'coastal_aerosol',
        '2': 'blue',
        '3': 'green',
        '4': 'red',
        '5': 'nir',
        '6': 'swir1',
        '7': 'swir2'
    })
    ls8_bigarray = ls8_bigarray.rename({
        '1': 'coastal_aerosol',
        '2': 'blue',
        '3': 'green',
        '4': 'red',
        '5': 'nir',
        '6': 'swir1',
        '7': 'swir2'
    })

    return ls8_array, ls8_usgs_array, ls8_bigarray, ls8_usgs_bigarray
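The lquery/lquery2 arguments are ordinary dc.load keyword queries; a hypothetical pair (same period, two extents) might look like:

lquery = {
    'x': (149.05, 149.10),   # hypothetical longitude range
    'y': (-35.30, -35.25),   # hypothetical latitude range
    'time': ('2018-01-01', '2018-03-31'),
    'output_crs': 'EPSG:3577',
    'resolution': (-25, 25),
}
lquery2 = {**lquery, 'x': (149.00, 149.15), 'y': (-35.35, -35.20)}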
Example #6
def create_mask(self, data):
    if self.values:
        mask = None
        for v in self.values:
            vmask = data == v
            if mask is None:
                mask = vmask
            else:
                mask |= vmask
    elif self.or_flags:
        mask = None
        for key, value in self.flags.items():
            f = {key: value}
            if mask is None:
                mask = make_mask(data, **f)
            else:
                mask |= make_mask(data, **f)
    else:
        mask = make_mask(data, **self.flags)
    return mask
Example #7
def make_mask(self, data, mask):
    odc_mask = None
    for dt in data.coords["time"].values:
        tpqdata = getattr(data.sel(time=dt), mask.band_name)
        if mask.flags:
            dt_mask = make_mask(tpqdata, **mask.flags)
        else:
            dt_mask = tpqdata == mask.enum
        if odc_mask is None:
            odc_mask = dt_mask
        else:
            odc_mask |= dt_mask
    return odc_mask
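Unlike Example #4, this multi-date variant ORs the per-timestep masks, flagging a pixel if it matched on any date. The same reduction with a toy DataArray:

import numpy as np
import xarray as xr

times = np.array(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]')
da = xr.DataArray([[True, False], [False, True]],
                  dims=('time', 'x'), coords={'time': times})

odc_mask = None
for dt in da.coords['time'].values:
    dt_mask = da.sel(time=dt)
    odc_mask = dt_mask if odc_mask is None else (odc_mask | dt_mask)
# odc_mask -> [True, True]: each pixel matched on at least one date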
Example #8
def compute_mosaic(products, measurements, **parsed_expressions):
    with Datacube() as dc:
        acq_range = parsed_expressions['time']
        click.echo("Processing time range {}".format(acq_range))
        datasets = []

        for prodname in products:
            dataset = dc.load(product=prodname,
                              measurements=measurements,
                              group_by='solar_day',
                              **parsed_expressions)
            if len(dataset) == 0:
                continue
            click.echo("Found {} time slices of {} during {}.".format(
                len(dataset['time']), prodname, acq_range))

            pq = dc.load(product=prodname.replace('nbar', 'pq'),
                         group_by='solar_day',
                         fuse_func=ga_pq_fuser,
                         **parsed_expressions)

            if len(pq) == 0:
                click.echo('No PQ found, skipping')
                continue

            crs = dataset.attrs['crs']
            dataset = dataset.where(dataset != -999)
            dataset.attrs['product'] = prodname
            dataset.attrs['crs'] = crs

            cloud_free = make_mask(pq.pixelquality, ga_good_pixel=True)
            dataset = dataset.where(cloud_free)

            if len(dataset) == 0:
                click.echo("Nothing left after PQ masking")
                continue

            datasets.append(dataset)

    dataset = xr.concat(datasets, dim='time')

    return dataset.median(dim='time')
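A hypothetical invocation; compute_mosaic expects the parsed expressions to include at least a 'time' range plus whatever spatial bounds dc.load needs:

mosaic = compute_mosaic(
    products=['ls8_nbar_albers'],            # hypothetical product name
    measurements=['red', 'green', 'blue'],
    time=('2017-01-01', '2017-12-31'),
    x=(149.0, 149.2), y=(-35.4, -35.2))      # hypothetical extents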
Example #9
def load_ard(dc,
             products=None,
             min_gooddata=0.0,
             pq_categories_s2=[
                 'vegetation', 'snow or ice', 'water', 'bare soils',
                 'unclassified', 'dark area pixels'
             ],
             pq_categories_ls=None,
             mask_pixel_quality=True,
             ls7_slc_off=True,
             predicate=None,
             dtype='auto',
             scaling='raw',
             **kwargs):
    '''
    Loads and combines Landsat Collection 1 or 2 and Sentinel-2 data
    for multiple sensors (i.e. ls5t, ls7e and ls8c for Landsat; s2a and
    s2b for Sentinel-2), optionally applies pixel quality masks, and
    drops time steps that contain less than a minimum proportion of
    good quality (i.e. non-cloudy and non-shadowed) pixels.
    The function supports loading the following DE Africa products:
    
        ls5_usgs_sr_scene
        ls7_usgs_sr_scene
        ls8_usgs_sr_scene
        usgs_ls8c_level2_2
        ga_ls8c_fractional_cover_2
        s2_l2a

    Last modified: March 2020
    
    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.
    products : list
        A list of product names to load data from. Valid options are
        Landsat C1: ['ls5_usgs_sr_scene', 'ls7_usgs_sr_scene', 'ls8_usgs_sr_scene'],
        Landsat C2: ['usgs_ls8c_level2_2']
        Sentinel-2: ['s2_l2a']
    min_gooddata : float, optional
        An optional float giving the minimum percentage of good quality
        pixels required for a satellite observation to be loaded.
        Defaults to 0.0 which will return all observations regardless of
        pixel quality (set to e.g. 0.99 to return only observations with
        more than 99% good quality pixels).
    pq_categories_s2 : list, optional
        An optional list of Sentinel-2 Scene Classification Layer (SCL) names 
        to treat as good quality observations in the above `min_gooddata` 
        calculation. The default is ['vegetation','snow or ice','water',
        'bare soils','unclassified', 'dark area pixels'] which will return
        non-cloudy or non-shadowed land, snow, water, veg, and non-veg pixels.
    pq_categories_ls : dict, optional
        An optional dictionary that is used to generate a good quality 
        pixel mask from the selected USGS product's pixel quality band (i.e. 
        'pixel_qa' for USGS Collection 1, and 'quality_l2_aerosol' for
        USGS Collection 2). This mask is used for both masking out low
        quality pixels (e.g. cloud or shadow), and for dropping 
        observations entirely based on the above `min_gooddata` 
        calculation. Default is None, which will apply the following mask 
        for USGS Collection 1: `{'cloud': 'no_cloud', 'cloud_shadow': 
        'no_cloud_shadow', 'nodata': False}`, and for USGS Collection 2:
        `{'cloud_shadow': 'not_cloud_shadow', 'cloud_or_cirrus': 
        'not_cloud_or_cirrus', 'nodata': False}`.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the good data
        mask to all observations that were not filtered out for having
        less good quality pixels than `min_gooddata`. E.g. if
        `min_gooddata=0.99`, the filtered observations may still contain
        up to 1% poor quality pixels. The default of True masks these
        pixels out and sets them to NaN using the good data mask; False
        simply returns the resulting observations without masking.
        Masking converts numeric values to floating point, which can
        cause memory issues; set to False to prevent this.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        True, which keeps all Landsat 7 observations, including those
        acquired after 31 May 2003.
    predicate : function, optional
        An optional function that can be passed in to restrict the
        datasets that are loaded by the function. A filter function
        should take a `datacube.model.Dataset` object as an input (i.e.
        as returned from `dc.find_datasets`), and return a boolean.
        For example, a filter function could be used to return True on
        only datasets acquired in January:
        `dataset.time.begin.month == 1`
    dtype : string, optional
        An optional parameter that controls the data type/dtype that
        layers are coerced to after loading. Valid values: 'native', 
        'auto', 'float{16|32|64}'. When 'auto' is used, the data will be 
        converted to `float32` if masking is used, otherwise data will 
        be returned in the native data type of the data. Be aware that
        if data is loaded in its native dtype, nodata and masked 
        pixels will be returned with the data's native nodata value 
        (typically -999), not NaN. 
    scaling : str, optional
        If 'normalised', then surface reflectance values are scaled from
        their original values to 0-1.  If 'raw' then dataset is returned
        in its native scaling. WARNING: USGS Landsat Collection 2
        surface reflectance values include an offset, so normalised
        band indices will return nonsensical results with scaling='raw'.
    **kwargs :
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query used to extract data. This typically
        includes `measurements`, `x`, `y`, `time`, `resolution`,
        `resampling`, `group_by` and `crs`. Keyword arguments can
        either be listed directly in the `load_ard` call like any
        other parameter (e.g. `measurements=['nbart_red']`), or by
        passing in a query kwarg dictionary (e.g. `**query`). For a
        list of possible options, see the `dc.load` documentation:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html
        
    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that
        contain greater than `min_gooddata` proportion of good quality
        pixels.
        
    '''

    #########
    # Setup #
    #########
    # prevent function altering original query object
    kwargs = deepcopy(kwargs)

    # We deal with `dask_chunks` separately
    dask_chunks = kwargs.pop('dask_chunks', None)
    requested_measurements = kwargs.pop('measurements', None)

    # Warn user if they combine lazy load with min_gooddata
    if (min_gooddata > 0.0) and dask_chunks is not None:
        warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                      "will cause dask arrays to compute when "
                      "loading pixel-quality data to calculate "
                      "'good pixel' percentage. This can "
                      "slow the return of your dataset.")

    # Verify that products were provided and determine if Sentinel-2
    # or Landsat data is being loaded
    if not products:
        raise ValueError('Please provide a list of product names '
                         'to load data from.')

    elif all(['level2' in product for product in products]):
        product_type = 'c2'
    elif all(['sr' in product for product in products]):
        product_type = 'c1'
    elif all(['s2' in product for product in products]):
        product_type = 's2'
    elif all(['fractional_cover' in product for product in products]):
        product_type = 'fc'
    else:
        raise ValueError('Unable to determine the collection for the '
                         'supplied products; please supply products '
                         'from a single supported collection')

    # Determine the pixel quality band to load for each collection
    if (product_type == 'c2') or (product_type == 'fc'):
        print('Using pixel quality parameters for USGS Collection 2')
        fmask_band = 'quality_l2_aerosol'

    elif product_type == 'c1':
        print('Using pixel quality parameters for USGS Collection 1')
        fmask_band = 'pixel_qa'

    elif product_type == 's2':
        print('Using pixel quality parameters for Sentinel 2')
        fmask_band = 'SCL'

    measurements = (requested_measurements.copy()
                    if requested_measurements else None)

    # Deal with "load all" case: pick a set of bands common across
    # all products
    if measurements is None:
        if product_type == 'fc':
            measurements = ['pv', 'npv', 'bs', 'ue']
        else:
            measurements = _common_bands(dc, products)

    # If `measurements` are specified but do not include pq, add.
    if measurements:
        # Skip FC: its pixel quality band is loaded separately below
        if product_type == 'fc':
            pass
        else:
            if fmask_band not in measurements:
                measurements.append(fmask_band)

    # Get list of data and mask bands so that we can later exclude
    # mask bands from being masked themselves
    if product_type == 'fc':
        pass
    else:
        data_bands = [band for band in measurements if band != fmask_band]
        mask_bands = [band for band in measurements if band not in data_bands]

    #################
    # Find datasets #
    #################

    # Pull out query params only to pass to dc.find_datasets
    query = _dc_query_only(**kwargs)

    # Extract datasets for each product using subset of dcload_kwargs
    dataset_list = []

    # Get list of datasets for each product
    print('Finding datasets')
    for product in products:

        # Obtain list of datasets for product
        print(f'    {product}')
        datasets = dc.find_datasets(product=product, **query)

        # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
        if not ls7_slc_off and product in [
                'ls7_usgs_sr_scene', 'usgs_ls7e_level2_2'
        ]:
            print('    Ignoring SLC-off observations for ls7')
            datasets = [
                i for i in datasets if
                i.time.begin < datetime.datetime(2003, 5, 31, tzinfo=pytz.UTC)
            ]

        # Add any returned datasets to list
        dataset_list.extend(datasets)

    # Raise exception if no datasets are returned
    if len(dataset_list) == 0:
        raise ValueError("No data available for query: ensure that "
                         "the products specified have data for the "
                         "time and location requested")

    # If predicate is specified, use this function to filter the list
    # of datasets prior to load
    if predicate:
        print('Filtering datasets using filter function')
        dataset_list = [ds for ds in dataset_list if predicate(ds)]

    # Raise exception if filtering removes all datasets
    if len(dataset_list) == 0:
        raise ValueError("No data available after filtering with "
                         "filter function")

    # load fmask from C2 for masking FC, and filter if required
    # NOTE: This works because only one sensor (ls8) has FC, if/when
    # FC is calculated for LS7, LS5, will need to move this section
    # into the for loop above.
    if product_type == 'fc':

        print('    PQ data from USGS C2')
        dataset_list_fc_pq = dc.find_datasets(product='usgs_ls8c_level2_2',
                                              **query)

        if predicate:
            print('Filtering datasets using filter function')
            dataset_list_fc_pq = [
                ds for ds in dataset_list_fc_pq if predicate(ds)
            ]

    #############
    # Load data #
    #############
    # Note we always load using dask here so that
    # we can lazy load data before filtering by good data
    ds = dc.load(datasets=dataset_list,
                 measurements=measurements,
                 dask_chunks={} if dask_chunks is None else dask_chunks,
                 **kwargs)

    if product_type == 'fc':
        ds_fc_pq = dc.load(
            datasets=dataset_list_fc_pq,
            dask_chunks={} if dask_chunks is None else dask_chunks,
            **kwargs)

    ####################
    # Filter good data #
    ####################

    # need to distinguish between products due to different
    # pq band properties

    # collection 2 USGS or FC
    if (product_type == 'c2') or (product_type == 'fc'):
        if pq_categories_ls is None:
            quality_flags_prod = {
                'cloud_shadow': 'not_cloud_shadow',
                'cloud_or_cirrus': 'not_cloud_or_cirrus',
                'nodata': False
            }
        else:
            quality_flags_prod = pq_categories_ls

        if product_type == 'fc':
            pq_mask = masking.make_mask(ds_fc_pq[fmask_band],
                                        **quality_flags_prod)
        else:
            pq_mask = masking.make_mask(ds[fmask_band], **quality_flags_prod)

    # collection 1 USGS
    if product_type == 'c1':
        if pq_categories_ls is None:
            quality_flags_prod = {
                'cloud': 'no_cloud',
                'cloud_shadow': 'no_cloud_shadow',
                'nodata': False
            }
        else:
            quality_flags_prod = pq_categories_ls

        pq_mask = masking.make_mask(ds[fmask_band], **quality_flags_prod)
    # sentinel 2
    if product_type == 's2':
        #currently broken for mask band values >=8
        #pq_mask = odc.algo.fmask_to_bool(ds[fmask_band],
        #                             categories=pq_categories_s2)
        flags_s2 = dc.list_measurements().loc[
            products[0]].loc[fmask_band]['flags_definition']['qa']['values']
        pq_mask = ds[fmask_band].isin(
            [int(k) for k, v in flags_s2.items() if v in pq_categories_s2])

    # The good data percentage calculation has to load in all `fmask`
    # data, which can be slow. If the user has chosen no filtering
    # by using the default `min_gooddata = 0`, we can skip this step
    # completely to save processing time
    if min_gooddata > 0.0:

        # Compute good data for each observation as % of total pixels
        print('Counting good quality pixels for each time step')
        data_perc = (pq_mask.sum(axis=[1, 2], dtype='int32') /
                     (pq_mask.shape[1] * pq_mask.shape[2]))

        keep = data_perc >= min_gooddata

        # Filter by `min_gooddata` to drop low quality observations
        total_obs = len(ds.time)
        ds = ds.sel(time=keep)
        pq_mask = pq_mask.sel(time=keep)
        print(f'Filtering to {len(ds.time)} out of {total_obs} '
              f'time steps with at least {min_gooddata:.1%} '
              f'good quality pixels')

    ###############
    # Apply masks #
    ###############

    # Generate good quality data mask
    mask = None
    if mask_pixel_quality:
        print('Applying pixel quality/cloud mask')
        mask = pq_mask

    # Split into data/masks bands, as conversion to float and masking
    # should only be applied to data bands
    if product_type == 'fc':
        ds_data = ds
    else:
        ds_data = ds[data_bands]
        ds_masks = ds[mask_bands]

    # Mask data if either of the above masks were generated
    if mask is not None:
        ds_data = odc.algo.keep_good_only(ds_data, where=mask)

    # Automatically set dtype to either native or float32 depending
    # on whether masking was requested
    if dtype == 'auto':
        dtype = 'native' if mask is None else 'float32'

    # Set nodata values using odc.algo tools to reduce peak memory
    # use when converting data dtype
    if dtype != 'native':
        ds_data = odc.algo.to_float(ds_data, dtype=dtype)

    # Put data and mask bands back together
    if product_type == 'fc':
        attrs = ds.attrs
        ds = ds_data
        ds.attrs.update(attrs)
    else:
        attrs = ds.attrs
        ds = xr.merge([ds_data, ds_masks])
        ds.attrs.update(attrs)

    ###############
    # Return data #
    ###############

    # Drop bands not originally requested by user
    if requested_measurements:
        ds = ds[requested_measurements]

    # Scale data 0-1 if requested
    if scaling == 'normalised':

        if product_type == 'c1':
            print("Re-scaling Landsat C1 data")
            not_sr_bands = ['pixel_qa', 'sr_aerosol', 'radsat_qa']

            for band in ds.data_vars:
                if band not in not_sr_bands:
                    ds[band] = ds[band] / 10000

        if product_type == 's2':
            print("Re-scaling Sentinel-2 data")
            not_sr_bands = [
                'scl', 'qa', 'mask', 'water_vapour',
                'aerosol_optical_thickness'
            ]

            for band in ds.data_vars:
                if band not in not_sr_bands:
                    ds[band] = ds[band] / 10000

    # Collection 2 Landsat raw values aren't useful, so they are always
    # rescaled; surface temperature and SR need different factors
    if product_type == 'c2':
        print("Re-scaling Landsat C2 data")
        not_sr_bands = [
            'thermal_radiance', 'upwell_radiance',
            'atmospheric_transmittance', 'emissivity', 'emissivity_stdev',
            'cloud_distance', 'quality_l2_aerosol',
            'quality_l2_surface_temperature', 'quality_l1_pixel',
            'quality_l1_radiometric_saturation', 'surface_temperature'
        ]

        for band in ds.data_vars:

            if band == 'surface_temperature':
                ds[band] = ds[band] * 0.00341802 + 149.0 - 273.15

            if band not in not_sr_bands:
                ds[band] = ds[band] * 2.75e-5 - 0.2

    # If user supplied dask_chunks, return data as a dask array without
    # actually loading it in
    if dask_chunks is not None:
        print(f'Returning {len(ds.time)} time steps as a dask array')
        return ds
    else:
        print(f'Loading {len(ds.time)} time steps')
        return ds.compute()
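A hypothetical call against one of the DE Africa products listed in the docstring:

import datacube

dc = datacube.Datacube(app='load_ard_example')
ds = load_ard(dc=dc,
              products=['ls8_usgs_sr_scene'],
              measurements=['red', 'green', 'blue'],
              x=(25.0, 25.2), y=(-14.2, -14.0),   # hypothetical extents
              time=('2019-01', '2019-03'),
              min_gooddata=0.9)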
Example #10
    def process_data(self, data, parameters):
        wofs_mask_flags = [
            dict(dry=True),
            dict(terrain_or_low_angle=False,
                 high_slope=False,
                 cloud_shadow=False,
                 cloud=False,
                 sea=False)
        ]

        water = data.data_vars['water']
        data = data.drop_vars(['water'])

        total = data.count(dim=['x', 'y'])
        total_valid = (data != -1).sum(dim=['x', 'y'])

        # TODO enable this check, investigate why it fails
        # if total_valid <= 0:
        #     raise ProcessError('query returned no data')

        for m in wofs_mask_flags:
            mask = make_mask(water, **m)
            data = data.where(mask)

        total_invalid = (np.isnan(data)).sum(dim=['x', 'y'])
        not_pixels = total_valid - (total - total_invalid)

        # following robbi's advice, cast the dataset to a dataarray
        maxFC = data.to_array(dim='variable', name='maxFC')

        # turn FC array into integer only as nanargmax doesn't seem to handle floats the way we want it to
        FC_int = maxFC.astype('int16')

        # use numpy.nanargmax to get the index of the maximum value along the variable dimension
        # BSPVNPV=np.nanargmax(FC_int, axis=0)
        BSPVNPV = FC_int.argmax(dim='variable')

        FC_mask = np.isfinite(maxFC).all(dim='variable')  # pylint: disable=no-member,unexpected-keyword-arg

        # Re-mask with NaNs to remove no-data
        BSPVNPV = BSPVNPV.where(FC_mask)

        FC_dominant = xarray.Dataset({
            'BS': (BSPVNPV == 0).where(FC_mask),
            'PV': (BSPVNPV == 1).where(FC_mask),
            'NPV': (BSPVNPV == 2).where(FC_mask)
        })

        FC_count = FC_dominant.sum(dim=['x', 'y'])

        # Fractional cover pixel count method
        # Get number of FC pixels, divide by total number of pixels per polygon
        new_ds = xarray.Dataset({
            'BS': (FC_count.BS / total_valid)['BS'] * 100,
            'PV': (FC_count.PV / total_valid)['PV'] * 100,
            'NPV': (FC_count.NPV / total_valid)['NPV'] * 100,
            'Unobservable': (not_pixels / total_valid)['BS'] * 100
        })

        print('dask compute')
        dask_time = default_timer()
        new_ds = new_ds.compute()
        print('dask took', default_timer() - dask_time, 'seconds')
        print(new_ds)

        df = new_ds.to_dataframe()
        df = df.drop('spatial_ref', axis=1)
        df.reset_index(inplace=True)
        return df
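The dominant-cover step above reduces to an argmax across the fractional bands; the same logic in isolation with plain numpy:

import numpy as np

# Rows are BS, PV, NPV fractions for four hypothetical pixels
fc = np.array([[60, 10, 80, 20],
               [30, 70, 10, 20],
               [10, 20, 10, 60]])
dominant = fc.argmax(axis=0)
# dominant -> array([0, 1, 0, 2]); 0=BS, 1=PV, 2=NPV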
Example #11
def load_nbarx(dc,
               sensor,
               query,
               product='nbart',
               bands_of_interest='',
               filter_pq=True):
    """
    Loads NBAR (Nadir BRDF Adjusted Reflectance) or NBAR-T (terrain corrected NBAR) data for a
    sensor, masks using pixel quality (PQ), then optionally filters out terrain -999s (for NBAR-T).
    Returns an xarray dataset and CRS and Affine objects defining map projection and geotransform

    Last modified: May 2018
    Author: Bex Dunn
    Modified by: Claire Krause, Robbi Bishop-Taylor, Bex Dunn

    inputs
    dc - Handle for the Datacube to import from. This allows you to also use dev environments
    if they have been imported into the environment.
    sensor - Options are 'ls5', 'ls7', 'ls8'
    query - A dict containing the query bounds. Can include lat/lon, time etc. 

    optional
    product - 'nbar' or 'nbart'. Defaults to nbart unless otherwise specified
    bands_of_interest - List of strings containing the bands to be read in; defaults to all bands,
                        options include 'red', 'green', 'blue', 'nir', 'swir1', 'swir2'
    filter_pq - boolean. Will filter clouds and saturated pixels using PQ unless set to False


    outputs
    ds - Extracted and optionally PQ filtered dataset
    crs - CRS object defining dataset coordinate reference system
    affine - Affine object defining dataset affine transformation
    """

    product_name = '{}_{}_albers'.format(sensor, product)
    mask_product = '{}_{}_albers'.format(sensor, 'pq')
    print('Loading {}'.format(product_name))

    # If bands of interest are given, assign measurements in dc.load call
    if bands_of_interest:

        ds = dc.load(product=product_name,
                     measurements=bands_of_interest,
                     group_by='solar_day',
                     **query)

    # If no bands of interest given, run without specifying measurements
    else:

        ds = dc.load(product=product_name, group_by='solar_day', **query)

    # Proceed if the resulting call returns data
    if ds.variables:

        crs = ds.crs
        affine = ds.affine
        print('Loaded {}'.format(product_name))

        # If pixel quality filtering is enabled, extract PQ data to use as mask
        if filter_pq:

            sensor_pq = dc.load(product=mask_product,
                                fuse_func=ga_pq_fuser,
                                group_by='solar_day',
                                **query)

            # If PQ call returns data, use to mask input data
            if sensor_pq.variables:
                print('Generating mask {}'.format(mask_product))
                good_quality = masking.make_mask(
                    sensor_pq.pixelquality,
                    cloud_acca='no_cloud',
                    cloud_shadow_acca='no_cloud_shadow',
                    cloud_shadow_fmask='no_cloud_shadow',
                    cloud_fmask='no_cloud',
                    blue_saturated=False,
                    green_saturated=False,
                    red_saturated=False,
                    nir_saturated=False,
                    swir1_saturated=False,
                    swir2_saturated=False,
                    contiguous=True)

                # Apply mask to preserve only good data
                ds = ds.where(good_quality)

        ds.attrs['crs'] = crs
        ds.attrs['affine'] = affine

        # Replace nodata values with NaN
        ds = masking.mask_invalid_data(ds)

        return ds, crs, affine

    else:

        print('Failed to load {}'.format(product_name))
        return None, None, None
Example #12
0
def load_clearlandsat(dc,
                      query,
                      sensors=('ls5', 'ls7', 'ls8'),
                      product='nbart',
                      dask_chunks={'time': 1},
                      lazy_load=False,
                      bands_of_interest=None,
                      masked_prop=0.0,
                      mask_dict=None,
                      mask_pixel_quality=True,
                      mask_invalid_data=True,
                      ls7_slc_off=False,
                      satellite_metadata=False):
    """Loads Landsat NBAR, NBART or FC25 and PQ data for multiple sensors (i.e. ls5, ls7, ls8) and returns a single 
    xarray dataset containing only observations that contain greater than a given proportion of good quality pixels.
    
    This function can be used to extract visually appealing time series of observations that are not affected by cloud,
    for example as an input to the `animated_timeseries` function from `DEAPlotting`.
    
    The proportion of clear pixels is calculated by summing the pixels that are marked as being good quality
    in the Landsat PQ25 layer. By default cloud, cloud shadow, saturated pixels and pixels missing data for any band 
    are considered poor quality data, but this can be customised using the `mask_dict` parameter.
    
    Last modified: March 2019
    Author: Robbi Bishop-Taylor, Bex Dunn    
    
    Parameters
    ----------    
    dc : datacube Datacube object
        A specific Datacube to import from, i.e. `dc = datacube.Datacube(app='Clear Landsat')`. This allows you to 
        also use development datacubes if they have been imported into the environment.    
    query : dict
        A dict containing the query bounds. Can include lat/lon, time etc. If no `time` query is given, the 
        function defaults to all timesteps available to all sensors (e.g. 1987-2018)
    sensors : list, optional
        An optional list of Landsat sensor names to load data for. Options are 'ls5', 'ls7', 'ls8'; defaults to all.
    product : str, optional
        An optional string specifying 'nbar', 'nbart' or 'fc'. Defaults to 'nbart'. For information on the difference, 
        see the '02_DEA_datasets/Introduction_to_Landsat' or '02_DEA_datasets/Introduction_to_Fractional_Cover'
        notebooks from DEA-notebooks.
    dask_chunks : dict, optional
        An optional dictionary containing the coords and sizes you wish to create dask chunks over. Usually
        used in combination with lazy_load=True (see below). example: dask_chunks = {'x': 500, 'y': 500}
    lazy_load : boolean, optional
        Setting this variable to 'True' will delay the computation of the function until you explicitly
        run ds.compute(). If used in conjunction with dask.distributed.Client(), this will allow
        for automatic parallel computation. 
    bands_of_interest : list, optional
        An optional list of strings containing the bands to be read in; options include 'red', 'green', 'blue', 
        'nir', 'swir1', 'swir2'; defaults to all available bands if no bands are specified.
    masked_prop : float, optional
        An optional float giving the minimum percentage of good quality pixels required for a Landsat observation to 
        be loaded. Defaults to 0.0 which will return all observations regardless of pixel quality (set to e.g. 0.99 
        to return only observations with more than 99% good quality pixels).
    mask_dict : dict, optional
        An optional dict of arguments to the `masking.make_mask` function that can be used to identify poor
        quality pixels from the PQ layer using alternative masking criteria. The default value of None masks
        out pixels flagged as cloud or cloud shadow by either the ACCA or Fmask algorithms, any saturated pixels, 
        or any pixels that are missing data in any band (equivalent to: `mask_dict={'cloud_acca': 'no_cloud', 
        'cloud_shadow_acca': 'no_cloud_shadow', 'cloud_shadow_fmask': 'no_cloud_shadow', 'cloud_fmask': 'no_cloud', 
        'blue_saturated': False, 'green_saturated': False, 'red_saturated': False, 'nir_saturated': False, 
        'swir1_saturated': False, 'swir2_saturated': False, 'contiguous': True}`. See the 
        `02_DEA_datasets/Introduction_to_LandsatPQ.ipynb` notebook on DEA Notebooks for a list of all possible options.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the pixel quality mask to all observations that were not
        filtered out for having less good quality pixels than `masked_prop`. For example, if `masked_prop=0.99`, the
        filtered images may still contain up to 1% poor quality pixels. The default of False simply returns the
        resulting observations without masking out these pixels; True masks them out and sets them to NaN using the
        pixel quality mask, but has the side effect of changing the data type of the output arrays from int16 to
        float32 which can cause memory issues. To reduce memory usage, set to False.
    mask_invalid_data : bool, optional
        An optional boolean indicating whether invalid -999 nodata values should be replaced with NaN. Defaults to
        True; this has the side effect of changing the data type of the output arrays from int16 to float32 which
        can cause memory issues. To reduce memory usage, set to False.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from after the Landsat 7 SLC failure (i.e. SLC-off).
        Defaults to False, which removes all Landsat 7 observations after May 31 2003. 
    satellite_metadata : bool, optional
        An optional boolean indicating whether to return the dataset with a `satellite` variable that gives the name 
        of the satellite that made each observation in the timeseries (i.e. ls5, ls7, ls8). Defaults to False. 
    
    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only Landsat observations that contain greater than `masked_prop`
        proportion of clear pixels.   
        
    Notes
    -----
    Memory issues: For large data extractions, it is recommended that you set both `mask_pixel_quality=False` and 
    `mask_invalid_data=False`. Otherwise, all output variables will be coerced to float32 when NaN values are 
    inserted into the array, potentially causing your data to use 2x as much memory. Be aware that the resulting
    arrays will contain invalid -999 values which should be considered in analyses.
        
    Example
    -------    
    >>> # Import modules
    >>> import datacube
    >>> import sys
    >>> # Import dea-notebooks functions using relative link to 10_Scripts directory
    >>> sys.path.append('../10_Scripts')
    >>> import DEADataHandling
    >>> # Connect to a datacube containing Landsat data
    >>> dc = datacube.Datacube(app='load_clearlandsat')
    >>> # Set up spatial and temporal query
    >>> query = {'x': (954163, 972163),
    ...          'y': (-3573891, -3555891),
    ...          'time': ('2011-06-01', '2013-06-01'),
    ...          'crs': 'EPSG:3577'}   
    >>> # Load observations with more than 75% good quality pixels from ls5, ls7 and ls8 as a combined dataset
    >>> landsat_ds = DEADataHandling.load_clearlandsat(dc=dc, query=query, sensors=['ls5', 'ls7', 'ls8'], 
    ...                                    bands_of_interest=['red', 'green', 'blue'], 
    ...                                    masked_prop=0.75, mask_pixel_quality=True, ls7_slc_off=True)
    Loading ls5
        Loading 4 filtered ls5 timesteps
    Loading ls7
        Loading 29 filtered ls7 timesteps
    Loading ls8
        Loading 3 filtered ls8 timesteps
    Combining and sorting ls5, ls7, ls8 data
        Replacing invalid -999 values with NaN (data will be coerced to float32)
    >>> # Test that function returned data
    >>> len(landsat_ds.time) > 0
    True
                
    """

    #######################
    # Process each sensor #
    #######################

    # Warn if loading a pq bitstring product and attempting to mask it
    # (and therefore cast to float)
    if product == 'pq' and (mask_invalid_data or mask_pixel_quality):
        warnings.warn(
            "You are attempting to load a pixel quality product with a "
            "mask flag (mask_invalid_data or mask_pixel_quality). Pixel "
            "quality is a bitstring (it only makes sense as int) and "
            "masking casts it to float32.")

    # Dictionary to save results from each sensor
    filtered_sensors = {}

    # Iterate through all sensors, returning only observations with > masked_prop clear pixels
    for sensor in sensors:

        # Load PQ data using dask
        print(f'Loading {sensor}')

        # If bands of interest are given, assign measurements in dc.load call. This is
        # for compatibility with the existing dea-notebooks load_nbarx function.
        if bands_of_interest:

            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           measurements=bands_of_interest,
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)

        # If no bands of interest given, run without specifying measurements, and
        # therefore return all available bands
        else:

            # Lazily load Landsat data using dask
            data = dc.load(product=f'{sensor}_{product}_albers',
                           group_by='solar_day',
                           dask_chunks=dask_chunks,
                           **query)

        # Load PQ data
        pq = dc.load(product=f'{sensor}_pq_albers',
                     group_by='solar_day',
                     fuse_func=ga_pq_fuser,
                     dask_chunks=dask_chunks,
                     **query)

        # If resulting dataset has data, continue:
        if data.variables:

            # Remove Landsat 7 SLC-off from PQ layer if ls7_slc_off=False
            if not ls7_slc_off and sensor == 'ls7':

                print('    Ignoring SLC-off observations for ls7')
                data = data.sel(time=data.time < np.datetime64('2003-05-30'))

            # If more than 0 timesteps
            if len(data.time) > 0:

                # Return only Landsat observations that have matching PQ data
                time = (data.time - pq.time).time
                data = data.sel(time=time)
                pq = pq.sel(time=time)

                # If a custom dict is provided for mask_dict, use these values to make mask from PQ
                if mask_dict:

                    # Mask PQ using custom values by unpacking mask_dict **kwarg
                    good_quality = masking.make_mask(pq.pixelquality,
                                                     **mask_dict)

                else:

                    # Identify pixels with no clouds in either ACCA or Fmask
                    good_quality = masking.make_mask(
                        pq.pixelquality,
                        cloud_acca='no_cloud',
                        cloud_shadow_acca='no_cloud_shadow',
                        cloud_shadow_fmask='no_cloud_shadow',
                        cloud_fmask='no_cloud',
                        blue_saturated=False,
                        green_saturated=False,
                        red_saturated=False,
                        nir_saturated=False,
                        swir1_saturated=False,
                        swir2_saturated=False,
                        contiguous=True)

                # Compute good data for each observation as a percentage of total array pixels. Need to
                # sum over x and y axes individually so that the function works with lat-lon dimensions,
                # and because it isn't currently possible to pass a list of axes (bug with xarray?)
                data_perc = good_quality.sum(axis=1).sum(
                    axis=1) / (good_quality.shape[1] * good_quality.shape[2])

                # Add data_perc data to Landsat dataset as a new xarray variable
                data['data_perc'] = xr.DataArray(data_perc,
                                                 [('time', data.time)])

                # Filter by data_perc to drop low quality observations and finally import data using dask
                filtered = data.sel(time=data.data_perc >= masked_prop)
                print(
                    f'    Loading {len(filtered.time)} filtered {sensor} timesteps'
                )

                # Optionally apply pixel quality mask to all observations that were not dropped in previous step
                if mask_pixel_quality:

                    # First change dtype to float32, then mask out values using
                    # `.where()`. By casting to float32, we prevent `.where()`
                    # from automatically casting to float64, using 2x the memory
                    # We also need to manually reset attributes due to a possible
                    # bug in recent xarray version
                    filtered = filtered.astype(
                        np.float32).assign_attrs(crs=filtered.crs)
                    filtered = filtered.where(good_quality)

                # Optionally add satellite name variable
                if satellite_metadata:
                    filtered['satellite'] = xr.DataArray(
                        [sensor] * len(filtered.time),
                        [('time', filtered.time)])

                # Add result to dictionary
                if lazy_load:
                    filtered_sensors[sensor] = filtered
                else:
                    filtered_sensors[sensor] = filtered.compute()

                # Close datasets
                filtered = None
                good_quality = None
                data = None
                pq = None

            else:

                # If there is no data for sensor or if another error occurs:
                print(f'    Skipping {sensor}; no valid data for query')

        else:

            # If there is no data for sensor or if another error occurs:
            print(f'    Skipping {sensor}; no valid data for query')

    ############################
    # Combine multiple sensors #
    ############################

    # Proceed with concatenating only if there is more than 1 sensor processed
    if len(filtered_sensors) > 1:

        # Concatenate all sensors into one big xarray dataset, and then sort by time
        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Combining and sorting {sensor_string} data')
        combined_ds = xr.concat(filtered_sensors.values(), dim='time')
        combined_ds = combined_ds.sortby('time')

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:

            print(
                '    Replacing invalid -999 values with NaN (data will be coerced to float32)'
            )

            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the memory
            # We also need to manually reset attributes due to a possible
            # bug in recent xarray version
            combined_ds = (combined_ds.astype(
                np.float32).assign_attrs(crs=combined_ds.crs))
            combined_ds = masking.mask_invalid_data(combined_ds)

        # reset pixel quality attributes
        if product == 'pq':
            combined_ds.pixelquality.attrs.update(
                list(filtered_sensors.values())[0].pixelquality.attrs)

        # Return combined dataset
        return combined_ds

    # Return the single dataset if only one sensor was processed
    elif len(filtered_sensors) == 1:

        sensor_string = ", ".join(filtered_sensors.keys())
        print(f'Returning {sensor_string} data')
        sensor_ds = list(filtered_sensors.values())[0]

        # Optionally filter to replace no data values with nans
        if mask_invalid_data:

            print(
                '    Replacing invalid -999 values with NaN (data will be coerced to float32)'
            )

            # First change dtype to float32, then mask out values using
            # `.where()`. By casting to float32, we prevent `.where()`
            # from automatically casting to float64, using 2x the memory
            # We also need to manually reset attributes due to a possible
            # bug in recent xarray version
            sensor_ds = (sensor_ds.astype(
                np.float32).assign_attrs(crs=sensor_ds.crs))
            sensor_ds = masking.mask_invalid_data(sensor_ds)

        return sensor_ds

    else:

        print(
            f'No data returned for query for any sensor in {", ".join(sensors)} '
            f'and time range {"-".join(query["time"])}')
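A hypothetical custom mask_dict for the function above, reusing dc and query from the docstring example and masking only Fmask cloud rather than the full default criteria:

landsat_ds = DEADataHandling.load_clearlandsat(
    dc=dc, query=query,
    mask_dict={'cloud_fmask': 'no_cloud'},  # hypothetical: Fmask cloud only
    masked_prop=0.5, mask_pixel_quality=True)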