Esempio n. 1
0
def test_create_mask_value(simple_var):
    """Check the pairs returned by `create_mask_value` for simple flags.

    Covers the `contiguous` flag on its own (set and unset) and combined
    with the `land_sea='land'` flag, using the flags definition exposed
    by the `simple_var` fixture.
    """
    flags = simple_var.flags_definition

    # (keyword arguments passed to create_mask_value, expected result)
    cases = (
        ({"contiguous": True}, (256, 256)),
        ({"contiguous": False}, (256, 0)),
        ({"contiguous": True, "land_sea": 'land'}, (768, 768)),
        ({"contiguous": False, "land_sea": 'land'}, (768, 512)),
    )

    for flag_kwargs, expected in cases:
        assert create_mask_value(flags, **flag_kwargs) == expected
Esempio n. 2
0
def load_ard(
    dc,
    products=None,
    min_gooddata=0.0,
    # NOTE: the mutable defaults below are only ever read, never mutated,
    # inside this function, so they are safe to share between calls.
    categories_to_mask_ls=dict(cloud="high_confidence",
                               cloud_shadow="high_confidence"),
    categories_to_mask_s2=[
        "cloud high probability",
        "Cloud medium probability",
        "Thin cirrus",
        "Saturated or defective pixel",
    ],
    categories_to_mask_s1=["invalid data"],
    mask_filters=None,
    mask_pixel_quality=True,
    ls7_slc_off=True,
    predicate=None,
    dtype="auto",
    verbose=True,
    **kwargs,
):
    """
    Loads analysis ready data.
    Loads and combines Landsat USGS Collections 2, Sentinel-2, and Sentinel-1 for
    multiple sensors (i.e. ls5t, ls7e and ls8c for Landsat; s2a and s2b for Sentinel-2),
    optionally applies pixel quality masks, and drops time steps that
    contain less than a minimum proportion of good quality (e.g. non-
    cloudy or shadowed) pixels.
    The function supports loading the following DE Africa products:
    Landsat:
        * ls5_sr ('sr' denotes surface reflectance)
        * ls7_sr
        * ls8_sr
        * ls5_st ('st' denotes surface temperature)
        * ls7_st
        * ls8_st
    Sentinel-2:
        * s2_l2a
    Sentinel-1:
        * s1_rtc
    All products passed in a single call must belong to the same sensor
    family (all Landsat, all Sentinel-2 or all Sentinel-1).
    Last modified: August 2021
    Parameters
    ----------
    dc : datacube Datacube object
        The Datacube to connect to, i.e. `dc = datacube.Datacube()`.
        This allows you to also use development datacubes if required.
    products : list or str
        A list of product names to load data from (a single product
        name may also be given as a string). For example:
        * Landsat C2: `['ls5_sr', 'ls7_sr', 'ls8_sr']`
        * Sentinel-2: `['s2_l2a']`
        * Sentinel-1: `['s1_rtc']`
    min_gooddata : float, optional
        An optional float giving the minimum proportion of good quality
        pixels required for a satellite observation to be loaded.
        Defaults to 0.0 which will return all observations regardless of
        pixel quality (set to e.g. 0.99 to return only observations with
        at least 99% good quality pixels).
    categories_to_mask_ls : dict, optional
        An optional dictionary that is used to identify poor quality pixels
        for masking. This mask is used for both masking out low
        quality pixels (e.g. cloud or shadow), and for dropping
        observations entirely based on the `min_gooddata` calculation.
        The 'cirrus' and 'cirrus_confidence' keys are not supported.
    categories_to_mask_s2 : list, optional
        An optional list of Sentinel-2 Scene Classification Layer (SCL) names
        that identify poor quality pixels for masking.
    categories_to_mask_s1 : list, optional
        An optional list of Sentinel-1 mask names that identify poor
        quality pixels for masking.
    mask_filters : iterable of tuples, optional
        Iterable tuples of morphological operations - ("<operation>", <radius>)
        to apply on mask, where:
        operation: string, can be one of these morphological operations:
                closing  = remove small holes in cloud - morphological closing
                opening  = shrinks away small areas of the mask
                dilation = adds padding to the mask
                erosion  = shrinks bright regions and enlarges dark regions
        radius: int
        e.g. mask_filters=[('erosion', 5),("opening", 2),("dilation", 2)]
        Only applied when `mask_pixel_quality` is also enabled.
    mask_pixel_quality : bool, optional
        An optional boolean indicating whether to apply the poor data
        mask to all observations that were not filtered out for having
        less good quality pixels than `min_gooddata`. E.g. if
        `min_gooddata=0.99`, the filtered observations may still contain
        up to 1% poor quality pixels. The default of True masks these
        pixels and sets them to NaN using the poor data mask. This will
        convert numeric values to floating point values which can cause
        memory issues; set to False to return the observations without
        masking out these pixels.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to
        True, which keeps all Landsat 7 observations > May 31 2003.
        The filter is only applied to the `ls7_sr` product.
    predicate : function, optional
        An optional function that can be passed in to restrict the
        datasets that are loaded by the function. A filter function
        should take a `datacube.model.Dataset` object as an input (i.e.
        as returned from `dc.find_datasets`), and return a boolean.
        For example, a filter function could be used to return True on
        only datasets acquired in January:
        `dataset.time.begin.month == 1`
    dtype : string, optional
        An optional parameter that controls the data type/dtype that
        layers are coerced to after loading. Valid values: 'native',
        'auto', 'float{16|32|64}'. When 'auto' is used, the data will be
        converted to `float32` if masking is used, otherwise data will
        be returned in the native data type of the data. Be aware that
        if data is loaded in its native dtype, nodata and masked
        pixels will be returned with the data's native nodata value
        (typically -999), not NaN. NOTE: If loading Landsat, the data is
        automatically rescaled so 'native' dtype will return a value error.
    verbose : bool, optional
        If True, print progress statements during loading
    **kwargs : dict, optional
        A set of keyword arguments to `dc.load` that define the
        spatiotemporal query used to extract data. This typically
        includes `measurements`, `x`, `y`, `time`, `resolution`,
        `resampling`, `group_by` and `crs`. Keyword arguments can
        either be listed directly in the `load_ard` call like any
        other parameter (e.g. `measurements=['red']`), or by
        passing in a query kwarg dictionary (e.g. `**query`). For a
        list of possible options, see the `dc.load` documentation:
        https://datacube-core.readthedocs.io/en/latest/dev/api/generate/datacube.Datacube.load.html
    Returns
    -------
    combined_ds : xarray Dataset
        An xarray dataset containing only satellite observations that
        contain at least `min_gooddata` proportion of good quality
        pixels. Returned lazily (dask-backed) when `dask_chunks` was
        supplied, otherwise computed before returning.
    Raises
    ------
    ValueError
        If no products are given; if the products do not all belong to
        one sensor family; if Landsat is requested with
        `dtype='native'`, with 'cirrus' mask categories, or with
        unsupported/aerosol measurements; or if no datasets remain
        for the query (before or after applying `predicate`).
    """

    #########
    # Setup #
    #########
    # Prevent the function from altering the caller's query object
    kwargs = deepcopy(kwargs)

    # `dask_chunks` and `measurements` are handled separately from the
    # remaining `dc.load` keyword arguments
    dask_chunks = kwargs.pop("dask_chunks", None)
    requested_measurements = kwargs.pop("measurements", None)

    # Normalise `measurements` to a list so that a single band name or a
    # tuple of band names behaves the same as a list of band names
    if isinstance(requested_measurements, str):
        requested_measurements = [requested_measurements]
    elif requested_measurements is not None:
        requested_measurements = list(requested_measurements)

    # Warn user if they combine lazy load with min_gooddata
    if verbose:
        if (min_gooddata > 0.0) and dask_chunks is not None:
            warnings.warn("Setting 'min_gooddata' percentage to > 0.0 "
                          "will cause dask arrays to compute when "
                          "loading pixel-quality data to calculate "
                          "'good pixel' percentage. This can "
                          "slow the return of your dataset.")

    # Verify that products were provided
    if not products:
        raise ValueError(
            "Please provide a list of product names to load data from. "
            "Valid options are: Landsat C2 SR: ['ls5_sr', 'ls7_sr', 'ls8_sr'], or "
            "Landsat C2 ST: ['ls5_st', 'ls7_st', 'ls8_st'], or "
            "Sentinel-2: ['s2_l2a'], or "
            "Sentinel-1: ['s1_rtc']")

    # Convert products to list if user passed as a string
    if isinstance(products, str):
        products = [products]

    # Determine which sensor family is being loaded. Every product must
    # belong to the same family because each family uses a different
    # pixel quality band and masking approach.
    if all("ls" in product for product in products):
        product_type = "ls"
    elif all("s2" in product for product in products):
        product_type = "s2"
    elif all("s1" in product for product in products):
        product_type = "s1"
    else:
        raise ValueError(
            f"Unable to determine a single product type for {products}. "
            "load_ard can only combine products from one sensor family "
            "per call: Landsat ('ls'), Sentinel-2 ('s2') or "
            "Sentinel-1 ('s1')")

    # Check if the Landsat products are surface temperature
    st = product_type == "ls" and all("st" in product
                                      for product in products)

    # Check some parameters before proceeding
    if product_type == "ls" and dtype == "native":
        raise ValueError(
            "Cannot load Landsat bands in native dtype "
            "as values require rescaling which converts dtype to float")

    if product_type == "ls":
        if any(k in categories_to_mask_ls
               for k in ("cirrus", "cirrus_confidence")):
            raise ValueError("'cirrus' categories for the pixel quality mask"
                             " are not supported by load_ard")

    # Select the pixel quality band matching the sensor family
    if product_type == "ls":
        if verbose:
            print("Using pixel quality parameters for USGS Collection 2")
        fmask_band = "pixel_quality"

    elif product_type == "s2":
        if verbose:
            print("Using pixel quality parameters for Sentinel 2")
        fmask_band = "scl"

    else:  # product_type == "s1"
        if verbose:
            print("Using pixel quality parameters for Sentinel 1")
        fmask_band = "mask"

    # Work on a copy so the user's requested list is left intact for the
    # final "drop bands not requested" step
    measurements = (list(requested_measurements)
                    if requested_measurements else None)

    # define a list of acceptable aliases to load landsat. We can't rely on 'common'
    # measurements as native band names have the same name for different measurements.
    ls_aliases = ["pixel_quality", "radiometric_saturation"]
    if st:
        ls_aliases = [
            "surface_temperature",
            "surface_temperature_quality",
            "atmospheric_transmittance",
            "thermal_radiance",
            "emissivity",
            "emissivity_stddev",
            "cloud_distance",
            "upwell_radiance",
            "downwell_radiance",
        ] + ls_aliases
    else:
        ls_aliases = ["red", "green", "blue", "nir", "swir_1", "swir_2"
                      ] + ls_aliases

    if measurements is not None and product_type == "ls":

        # check we aren't loading aerosol bands from LS8
        aerosol_bands = [
            "aerosol_qa",
            "qa_aerosol",
            "atmos_opacity",
            "coastal_aerosol",
            "SR_QA_AEROSOL",
        ]
        if any(b in aerosol_bands for b in measurements):
            raise ValueError("load_ard doesn't support loading aerosol or "
                             "atmospeheric opacity related bands "
                             "for Landsat, instead use dc.load()")

        # check measurements are in acceptable aliases list for landsat
        if not set(measurements).issubset(ls_aliases):
            raise ValueError(
                "load_ard does not support all band aliases for Landsat, "
                "use only the following band names to load Landsat data: "
                + str(ls_aliases))

    # Deal with "load all" case: pick a set of bands common across
    # all products
    if measurements is None:
        if product_type == "ls":
            # copy so later appends cannot alter `ls_aliases`
            measurements = list(ls_aliases)
        else:
            measurements = _common_bands(dc, products)

    # If `measurements` do not include the pixel quality band, add it
    if measurements:
        if fmask_band not in measurements:
            measurements.append(fmask_band)

    # Get list of data and mask bands so that we can later exclude
    # mask bands from being masked themselves (also handle the case of rad_sat)
    data_bands = [
        band for band in measurements
        if band not in (fmask_band, "radiometric_saturation")
    ]
    mask_bands = [band for band in measurements if band not in data_bands]

    #################
    # Find datasets #
    #################

    # Pull out query params only to pass to dc.find_datasets
    query = _dc_query_only(**kwargs)

    # Datasets found across all requested products
    dataset_list = []

    # Get list of datasets for each product
    if verbose:
        print("Finding datasets")
    for product in products:

        # Obtain list of datasets for product
        if verbose:
            print(f"    {product}")

        if product_type == "ls":
            # handle LS separately to S2/S1 due to collection_category:
            # force the user to load Tier 1
            datasets = dc.find_datasets(product=product,
                                        collection_category='T1',
                                        **query)
        else:
            datasets = dc.find_datasets(product=product, **query)

        # Remove Landsat 7 SLC-off observations if ls7_slc_off=False
        # NOTE(review): only 'ls7_sr' is filtered here; 'ls7_st' is not.
        # Confirm whether surface temperature should be filtered too.
        if not ls7_slc_off and product in ["ls7_sr"]:
            if verbose:
                print("    Ignoring SLC-off observations for ls7")
            datasets = [
                i for i in datasets if
                i.time.begin < datetime.datetime(2003, 5, 31, tzinfo=pytz.UTC)
            ]

        # Add any returned datasets to list
        dataset_list.extend(datasets)

    # Raise exception if no datasets are returned
    if len(dataset_list) == 0:
        raise ValueError("No data available for query: ensure that "
                         "the products specified have data for the "
                         "time and location requested")

    # If predicate is specified, use this function to filter the list
    # of datasets prior to load
    if predicate:
        if verbose:
            print("Filtering datasets using filter function")
        dataset_list = [ds for ds in dataset_list if predicate(ds)]

    # Raise exception if filtering removes all datasets
    if len(dataset_list) == 0:
        raise ValueError("No data available after filtering with "
                         "filter function")

    #############
    # Load data #
    #############

    # Note we always load using dask here so that
    # we can lazy load data before filtering by good data
    ds = dc.load(
        datasets=dataset_list,
        measurements=measurements,
        dask_chunks={} if dask_chunks is None else dask_chunks,
        **kwargs,
    )

    ####################
    # Filter good data #
    ####################

    # need to distinguish between products due to different
    # pq band properties. In `pq_mask`, True marks a poor quality pixel.

    # collection 2 USGS
    if product_type == "ls":
        bad_bits, _ = masking.create_mask_value(
            ds[fmask_band].attrs["flags_definition"], **categories_to_mask_ls)
        pq_mask = (ds[fmask_band] & bad_bits) != 0

    # sentinel 2
    elif product_type == "s2":
        # drop pixels where the scene classification value is 0
        ds = ds.where(ds.scl != 0, drop=True)
        pq_mask = odc.algo.enum_to_bool(mask=ds[fmask_band],
                                        categories=categories_to_mask_s2)

    # sentinel 1
    else:
        pq_mask = odc.algo.enum_to_bool(mask=ds[fmask_band],
                                        categories=categories_to_mask_s1)

    # The good data percentage calculation has to load in all `fmask`
    # data, which can be slow. If the user has chosen no filtering
    # by using the default `min_gooddata = 0`, we can skip this step
    # completely to save processing time
    if min_gooddata > 0.0:

        # Compute good data for each observation as % of total pixels.
        # Inverting the pq_mask for this because cloud=True in pq_mask
        # and we want to sum good pixels
        if verbose:
            print("Counting good quality pixels for each time step")
        data_perc = (~pq_mask).sum(
            axis=[1, 2], dtype="int32") / (pq_mask.shape[1] * pq_mask.shape[2])

        keep = (data_perc >= min_gooddata).persist()

        # Filter by `min_gooddata` to drop low quality observations
        total_obs = len(ds.time)
        ds = ds.sel(time=keep)
        pq_mask = pq_mask.sel(time=keep)
        if verbose:
            print(f"Filtering to {len(ds.time)} out of {total_obs} "
                  f"time steps with at least {min_gooddata:.1%} "
                  f"good quality pixels")

    # morphological filtering on cloud masks
    if mask_filters is not None and mask_pixel_quality:
        if verbose:
            print(f"Applying morphological filters to pq mask {mask_filters}")
        pq_mask = mask_cleanup(pq_mask, mask_filters=mask_filters)

    ###############
    # Apply masks #
    ###############

    # Generate good quality data mask
    mask = None
    if mask_pixel_quality:
        if verbose:
            print("Applying pixel quality/cloud mask")
        mask = pq_mask

    # Split into data/masks bands, as conversion to float and masking
    # should only be applied to data bands
    ds_data = ds[data_bands]
    ds_masks = ds[mask_bands]

    # Mask data if a pixel quality mask was generated
    if mask is not None:
        ds_data = odc.algo.erase_bad(ds_data, where=mask)

    # Automatically set dtype to either native or float32 depending
    # on whether masking was requested
    if dtype == "auto":
        dtype = "native" if mask is None else "float32"

    # Set nodata values using odc.algo tools to reduce peak memory
    # use when converting data dtype
    if dtype != "native":
        ds_data = odc.algo.to_float(ds_data, dtype=dtype)

    # Put data and mask bands back together
    attrs = ds.attrs
    ds = xr.merge([ds_data, ds_masks])
    ds.attrs.update(attrs)

    ###############
    # Return data #
    ###############

    # Drop bands not originally requested by user
    if requested_measurements:
        ds = ds[requested_measurements]

    # Apply the scale and offset factors to Collection 2 Landsat. We need
    # different factors for different bands. Also handle the case where
    # mask_pixel_quality = False, in which case the dtype is still
    # in int, so we convert it to float
    if product_type == "ls":
        if verbose:
            print("Re-scaling Landsat C2 data")

        sr_bands = ["red", "green", "blue", "nir", "swir_1", "swir_2"]
        radiance_bands = [
            "thermal_radiance", "upwell_radiance", "downwell_radiance"
        ]
        trans_emiss = [
            "atmospheric_transmittance", "emissivity", "emissivity_stddev"
        ]
        qa = ["pixel_quality", "radiometric_saturation"]

        if not mask_pixel_quality:
            # set nodata to NaNs before rescaling
            # in the case where masking hasn't already done this
            for band in list(ds.data_vars):
                if band not in qa:
                    ds[band] = odc.algo.to_f32(ds[band])

        # Each band falls into at most one of the categories below
        for band in list(ds.data_vars):
            if band == "cloud_distance":
                ds[band] = 0.01 * ds[band]

            elif band == "surface_temperature_quality":
                ds[band] = 0.01 * ds[band]

            elif band in radiance_bands:
                ds[band] = 0.001 * ds[band]

            elif band in trans_emiss:
                ds[band] = 0.0001 * ds[band]

            elif band in sr_bands:
                ds[band] = 2.75e-5 * ds[band] - 0.2

            elif band == "surface_temperature":
                ds[band] = ds[band] * 0.00341802 + 149.0

        # add back attrs that are lost during scaling calcs
        for band in ds.data_vars:
            ds[band].attrs.update(attrs)

    # If user supplied dask_chunks, return data as a dask array without
    # actually loading it in
    if dask_chunks is not None:
        if verbose:
            print(f"Returning {len(ds.time)} time steps as a dask array")
        return ds

    if verbose:
        print(f"Loading {len(ds.time)} time steps")
    return ds.compute()
Esempio n. 3
0
def test_ga_good_pixel(simple_var):
    """Check the pair returned by `create_mask_value` for `ga_good_pixel=True`.

    Uses the flags definition exposed by the `simple_var` fixture.
    """
    flags = simple_var.flags_definition
    expected = (16383, 16383)

    result = create_mask_value(flags, ga_good_pixel=True)

    assert result == expected
Esempio n. 4
0
def test_create_multi_mask_value():
    """Check `create_mask_value` against a multi-bit flags definition.

    Exercises single flags, combinations of flags, and verifies that an
    unknown flag name or an unknown enum value raises `ValueError`.
    """
    flags = VariableWithMultiBitFlags().flags_definition

    # (keyword arguments passed to create_mask_value, expected result)
    cases = [
        ({"filled": True}, (1, 1)),
        ({"water_confidence": 'water'}, (0b011000, 0b011000)),
        ({"water_confidence": 'water', "filled": True},
         (0b011001, 0b011001)),
        ({"water_confidence": 'not_determined'}, (0b011000, 0b0)),
        ({"water_confidence": 'no_water'}, (0b11000, 0b01000)),
        ({"veg_confidence": 'maybe_veg'}, (0b110000000, 0b100000000)),
        ({"veg_confidence": 'maybe_veg', "water_confidence": 'water'},
         (0b110011000, 0b100011000)),
        ({"veg_confidence": 'maybe_veg', "water_confidence": 'water',
          "filled": True},
         (0b110011001, 0b100011001)),
        ({"water_confidence": 'maybe_water'}, (0b011000, 0b10000)),
    ]
    for flag_kwargs, expected in cases:
        assert create_mask_value(flags, **flag_kwargs) == expected

    # Invalid requests: an unknown flag name, then an unknown enum value
    invalid_requests = [
        {"this_flag_doesnot_exist": 9},
        {"water_confidence": 'invalid enum value'},
    ]
    for bad_kwargs in invalid_requests:
        with pytest.raises(ValueError):
            create_mask_value(flags, **bad_kwargs)