Example #1
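All of the examples below are excerpts and assume a common preamble of imports. The following is a best-guess reconstruction based on the names the snippets use; exact module paths vary between Digital Earth Africa / DEA script versions, so treat it as an assumption rather than the original header:

import datetime as dt
import warnings
from copy import deepcopy

import datacube
import numpy as np
import rasterio.features
import xarray as xr
from datacube.utils import geometry
from datacube.utils.geometry import assign_crs
from datacube.testutils.io import rio_slurp_xarray
from odc.algo import xr_geomedian, xr_reproject
from pyproj import Proj, transform

# Project-specific helpers whose module paths are assumptions here:
# calculate_indices, load_ard, mostcommon_crs, HiddenPrints, xr_rasterize,
# sklearn_flatten, GeoMedian, xr_geomedian_tmad, xr_geomedian_tmad_new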
def annual_gm_mads_evi_training(ds):
    dc = datacube.Datacube(app='training')
    
    # grab gm+tmads
    gm_mads = dc.load(product='ga_s2_gm', time='2019', like=ds.geobox,
                      measurements=['red', 'blue', 'green', 'nir',
                                    'swir_1', 'swir_2', 'red_edge_1',
                                    'red_edge_2', 'red_edge_3', 'SMAD',
                                    'BCMAD', 'EMAD'])
    
    gm_mads['SMAD'] = -np.log(gm_mads['SMAD'])
    gm_mads['BCMAD'] = -np.log(gm_mads['BCMAD'])
    gm_mads['EMAD'] = -np.log(gm_mads['EMAD']/10000)
    
    #calculate band indices on gm
    gm_mads = calculate_indices(gm_mads,
                               index=['EVI','LAI','MNDWI'],
                               drop=False,
                               collection='s2')
    
    #normalise spectral GM bands 0-1
    for band in gm_mads.data_vars:
        if band not in ['SMAD', 'BCMAD','EMAD', 'EVI', 'LAI', 'MNDWI']:
            gm_mads[band] = gm_mads[band] / 10000
    
    #calculate EVI on annual timeseries
    evi = calculate_indices(ds, index=['EVI'], drop=True,
                            normalise=True, collection='s2')
    
    # EVI stats 
    gm_mads['evi_std'] = evi.EVI.std(dim='time')
    gm_mads['evi_10'] = evi.EVI.quantile(0.1, dim='time')
    gm_mads['evi_25'] = evi.EVI.quantile(0.25, dim='time')
    gm_mads['evi_75'] = evi.EVI.quantile(0.75, dim='time')
    gm_mads['evi_90'] = evi.EVI.quantile(0.9, dim='time')
    gm_mads['evi_range'] = gm_mads['evi_90'] - gm_mads['evi_10']
    
    #rainfall climatology
    chirps_S1 = xr_reproject(assign_crs(xr.open_rasterio('/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc'),
                                        crs='epsg:4326'), ds.geobox,"bilinear")
    
    chirps_S2 = xr_reproject(assign_crs(xr.open_rasterio('/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc'), 
                                        crs='epsg:4326'), ds.geobox,"bilinear")
        
    gm_mads['rain_S1'] = chirps_S1
    gm_mads['rain_S2'] = chirps_S2
    
    #slope
    url_slope = "https://deafrica-data.s3.amazonaws.com/ancillary/dem-derivatives/cog_slope_africa.tif"
    slope = rio_slurp_xarray(url_slope, gbox=ds.geobox)
    slope = slope.to_dataset(name='slope')#.chunk({'x':2000,'y':2000})
    
    result = xr.merge([gm_mads,slope],compat='override')

    return result.squeeze()
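The MAD layers are passed through -np.log (with EMAD rescaled first), presumably to spread their small positive values over a wider range before classification. A quick numeric illustration of the effect:

import numpy as np

smad = np.array([0.001, 0.01, 0.1])  # raw MAD values are small positives near zero
print(-np.log(smad))                 # approximately [6.91, 4.61, 2.30] -- a wider, more even spread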
Example #2
 def fun(ds, era):
     #geomedian and tmads
     gm_mads = xr_geomedian_tmad(ds)
     gm_mads = calculate_indices(gm_mads,
                            index=['NDVI','LAI','MNDWI'],
                            drop=False,
                            normalise=False,
                            collection='s2')
     
     gm_mads['sdev'] = -np.log(gm_mads['sdev'])
     gm_mads['bcdev'] = -np.log(gm_mads['bcdev'])
     gm_mads['edev'] = -np.log(gm_mads['edev'])
     
     #rainfall climatology
     if era == '_S1':
         chirps = assign_crs(xr.open_rasterio('/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc'),  crs='epsg:4326')
     elif era == '_S2':
         chirps = assign_crs(xr.open_rasterio('/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc'),  crs='epsg:4326')
     
     chirps = xr_reproject(chirps,ds.geobox,"bilinear")
     gm_mads['rain'] = chirps
     
     for band in gm_mads.data_vars:
         gm_mads = gm_mads.rename({band:band+era})
     
     return gm_mads
Example #3
    def fun(ds, era):
        # six-month geomedians
        gm_mads = xr_geomedian(ds)
        gm_mads = calculate_indices(
            gm_mads,
            index=["NDVI", "LAI", "MNDWI"],
            drop=False,
            normalise=False,
            collection="s2",
        )

        # rainfall climatology
        if era == "_S1":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )
        elif era == "_S2":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )

        chirps = xr_reproject(chirps, ds.geobox, "bilinear")
        gm_mads["rain"] = chirps

        for band in gm_mads.data_vars:
            gm_mads = gm_mads.rename({band: band + era})

        return gm_mads
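The era suffix appended to every data variable suggests these six-month feature sets are meant to be merged into a single seasonal feature layer. A hypothetical wiring, with ds_s1 and ds_s2 standing in for the Jan-Jun and Jul-Dec Sentinel-2 datasets:

# Hypothetical usage: ds_s1/ds_s2 are assumed six-month Sentinel-2 datasets
epoch1 = fun(ds_s1, era='_S1')
epoch2 = fun(ds_s2, era='_S2')
seasonal_features = xr.merge([epoch1, epoch2], compat='override').squeeze()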
Example #4
    def fun(ds, chirps, chpclim, era):
        ds = calculate_indices(ds,
                               index=['EVI'],
                               drop=False,
                               normalise=False,
                               collection='s2')
        #geomedian and tmads
        gm_mads = xr_geomedian_tmad(ds)
        gm_mads = calculate_indices(gm_mads,
                                    index=['EVI', 'NDVI', 'LAI', 'MNDWI'],
                                    drop=False,
                                    normalise=False,
                                    collection='s2')

        gm_mads['sdev'] = -np.log(gm_mads['sdev'])
        gm_mads['bcdev'] = -np.log(gm_mads['bcdev'])
        gm_mads['edev'] = -np.log(gm_mads['edev'])

        # EVI stats
        gm_mads['evi_10'] = ds.EVI.quantile(0.1, dim='time')
        gm_mads['evi_50'] = ds.EVI.quantile(0.5, dim='time')
        gm_mads['evi_90'] = ds.EVI.quantile(0.9, dim='time')
        gm_mads['evi_range'] = gm_mads['evi_90'] - gm_mads['evi_10']
        gm_mads['evi_std'] = ds.EVI.std(dim='time')

        # rainfall actual
        gm_mads['rain_min'] = chirps.min(dim='time')
        gm_mads['rain_mean'] = chirps.mean(dim='time')
        gm_mads['rain_max'] = chirps.max(dim='time')
        gm_mads['rain_range'] = gm_mads['rain_max'] - gm_mads['rain_min']
        gm_mads['rain_std'] = chirps.std(dim='time')

        # rainfall climatology
        gm_mads['rainclim_min'] = chpclim.min(dim='time')
        gm_mads['rainclim_mean'] = chpclim.mean(dim='time')
        gm_mads['rainclim_max'] = chpclim.max(dim='time')
        gm_mads['rainclim_range'] = gm_mads['rainclim_max'] - gm_mads[
            'rainclim_min']
        gm_mads['rainclim_std'] = chpclim.std(dim='time')

        for band in gm_mads.data_vars:
            gm_mads = gm_mads.rename({band: band + era})

        return gm_mads
Example #5
    def fun(ds, chirps, chpclim, era):
        ds = calculate_indices(
            ds, index=["EVI"], drop=False, normalise=False, collection="s2"
        )
        # geomedian and tmads
        gm_mads = xr_geomedian_tmad(ds)
        gm_mads = calculate_indices(
            gm_mads,
            index=["EVI", "NDVI", "LAI", "MNDWI"],
            drop=False,
            normalise=False,
            collection="s2",
        )

        gm_mads["sdev"] = -np.log(gm_mads["sdev"])
        gm_mads["bcdev"] = -np.log(gm_mads["bcdev"])
        gm_mads["edev"] = -np.log(gm_mads["edev"])

        # EVI stats
        gm_mads["evi_10"] = ds.EVI.quantile(0.1, dim="time")
        gm_mads["evi_50"] = ds.EVI.quantile(0.5, dim="time")
        gm_mads["evi_90"] = ds.EVI.quantile(0.9, dim="time")
        gm_mads["evi_range"] = gm_mads["evi_90"] - gm_mads["evi_10"]
        gm_mads["evi_std"] = ds.EVI.std(dim="time")

        # rainfall actual
        gm_mads["rain_min"] = chirps.min(dim="time")
        gm_mads["rain_mean"] = chirps.mean(dim="time")
        gm_mads["rain_max"] = chirps.max(dim="time")
        gm_mads["rain_range"] = gm_mads["rain_max"] - gm_mads["rain_min"]
        gm_mads["rain_std"] = chirps.std(dim="time")

        # rainfall climatology
        gm_mads["rainclim_min"] = chpclim.min(dim="time")
        gm_mads["rainclim_mean"] = chpclim.mean(dim="time")
        gm_mads["rainclim_max"] = chpclim.max(dim="time")
        gm_mads["rainclim_range"] = gm_mads["rainclim_max"] - gm_mads["rainclim_min"]
        gm_mads["rainclim_std"] = chpclim.std(dim="time")

        for band in gm_mads.data_vars:
            gm_mads = gm_mads.rename({band: band + era})

        return gm_mads
Example #6
    def fun(ds, era):
        # geomedian and tmads
        # gm_mads = xr_geomedian_tmad(ds)
        gm_mads = xr_geomedian_tmad_new(ds).compute()
        gm_mads = calculate_indices(
            gm_mads,
            index=["NDVI", "LAI", "MNDWI"],
            drop=False,
            normalise=False,
            collection="s2",
        )

        gm_mads["sdev"] = -np.log(gm_mads["sdev"])
        gm_mads["bcdev"] = -np.log(gm_mads["bcdev"])
        gm_mads["edev"] = -np.log(gm_mads["edev"])
        gm_mads = gm_mads.chunk({"x": 2000, "y": 2000})

        # rainfall climatology
        if era == "_S1":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )
        elif era == "_S2":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )

        chirps = xr_reproject(chirps, ds.geobox, "bilinear")
        chirps = chirps.chunk({"x": 2000, "y": 2000})
        gm_mads["rain"] = chirps

        for band in gm_mads.data_vars:
            gm_mads = gm_mads.rename({band: band + era})

        return gm_mads
Example #7
    def fun(ds, era):
        # normalise SR and edev bands
        for band in ds.data_vars:
            if band not in ["sdev", "bcdev"]:
                ds[band] = ds[band] / 10000

        gm_mads = calculate_indices(
            ds,
            index=["NDVI", "LAI", "MNDWI"],
            drop=False,
            normalise=False,
            collection="s2",
        )

        gm_mads["sdev"] = -np.log(gm_mads["sdev"])
        gm_mads["bcdev"] = -np.log(gm_mads["bcdev"])
        gm_mads["edev"] = -np.log(gm_mads["edev"])

        # rainfall climatology
        if era == "_S1":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )
        elif era == "_S2":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )

        chirps = xr_reproject(chirps, ds.geobox, "bilinear")
        gm_mads["rain"] = chirps

        for band in gm_mads.data_vars:
            gm_mads = gm_mads.rename({band: band + era})

        return gm_mads
Example #8
def get_training_data_for_shp(gdf,
                              index,
                              row,
                              out_arrs,
                              out_vars,
                              products,
                              dc_query,
                              custom_func=None,
                              field=None,
                              calc_indices=None,
                              reduce_func=None,
                              drop=True,
                              zonal_stats=None):
    """
    Function to extract data from the ODC for training a machine learning classifier 
    using a geopandas geodataframe of labelled geometries. 
    This function provides a number of pre-defined methods for producing training data,
    including calculating band indices, reducing time series using several summary statistics,
    and/or generating zonal statistics across polygons. The 'custom_func' parameter lets
    the user supply a custom function for generating features rather than using the
    pre-defined methods.

    Parameters
    ----------
    gdf : geopandas geodataframe
        geometry data in the form of a geopandas geodataframe
    products : list
        a list of products to load from the datacube. 
        e.g. ['ls8_usgs_sr_scene', 'ls7_usgs_sr_scene']
    dc_query : dictionary
        Datacube query object, should not contain lat and long (x or y)
        variables as these are supplied by the 'gdf' variable
    field : string 
        A string containing the name of column with class labels. 
        Field must contain numeric values.
    out_arrs : list 
        An empty list into which the training data arrays are stored.
    out_vars : list 
        An empty list into which the data variable names are stored.
    custom_func : function, optional 
        A custom function for generating feature layers. If this parameter
        is set, all other options (excluding 'zonal_stats'), will be ignored.
        The result of the 'custom_func' must be a single xarray dataset 
        containing 2D coordinates (i.e x, y - no time dimension). The custom function
        has access to the datacube dataset extracted using the 'dc_query' params,
        along with access to the 'dc_query' dictionary itself, which could be used
        to load other products besides those specified under 'products'.
    calc_indices: list, optional
        If not using a custom func, then this parameter provides a method for
        calculating a number of remote sensing indices (e.g. `['NDWI', 'NDVI']`).
    reduce_func : string, optional 
        Function to reduce the data from multiple time steps to
        a single timestep. Options are 'mean', 'median', 'std',
        'max', 'min', 'geomedian'.  Ignored if 'custom_func' is provided.
    drop : boolean, optional
        If this variable is set to True, and 'calc_indices' are supplied, the
        spectral bands will be dropped from the dataset leaving only the
        band indices as data variables in the dataset. Default is True.
    zonal_stats : string, optional
        An optional string giving the names of zonal statistics to calculate 
        for each polygon. Default is None (all pixel values are returned). Supported 
        values are 'mean', 'median', 'max', 'min', and 'std'. Will work in 
        conjunction with a 'custom_func'.


    Returns
    --------
    Nothing is returned directly; instead, the function appends to the two supplied
    lists: a numpy.array of the class label and extracted data for each pixel or
    polygon (to 'out_arrs'), and the corresponding data variable names (to 'out_vars').

    """

    # prevent function altering dictionary kwargs
    dc_query = deepcopy(dc_query)

    # remove dask chunks if supplied, as multiprocessing
    # is used for parallelization
    if 'dask_chunks' in dc_query.keys():
        dc_query.pop('dask_chunks', None)

    # connect to datacube
    dc = datacube.Datacube(app='training_data')

    # set up query based on polygon (convert to WGS84)
    geom = geometry.Geometry(gdf.geometry.values[index].__geo_interface__,
                             geometry.CRS('epsg:4326'))

    q = {"geopolygon": geom}

    # merge polygon query with user supplied query params
    dc_query.update(q)

    # Identify the most common projection system in the input query
    output_crs = mostcommon_crs(dc=dc, product=products, query=dc_query)

    # load_ard doesn't handle geomedians
    # TODO: Add support for other sensors
    if 'ga_ls8c_gm_2_annual' in products:
        ds = dc.load(product='ga_ls8c_gm_2_annual', **dc_query)
        ds = ds.where(ds != 0, np.nan)

    else:
        # load data
        with HiddenPrints():
            ds = load_ard(dc=dc,
                          products=products,
                          output_crs=output_crs,
                          **dc_query)

    # create polygon mask
    with HiddenPrints():
        mask = xr_rasterize(gdf.iloc[[index]], ds)

    # mask dataset
    ds = ds.where(mask)

    # Use custom function for training data if it exists
    if custom_func is not None:
        with HiddenPrints():
            data = custom_func(ds)

    else:
        # first check enough variables are set to run functions
        if (len(ds.time.values) > 1) and (reduce_func is None):
            raise ValueError(
                "Your dataset has " + str(len(ds.time.values)) +
                " time-steps, please provide a reduction function," +
                " e.g. reduce_func='mean'")

        if calc_indices is not None:
            # determine which collection is being loaded
            if 'level2' in products[0]:
                collection = 'c2'
            elif 'gm' in products[0]:
                collection = 'c2'
            elif 'sr' in products[0]:
                collection = 'c1'
            elif 's2' in products[0]:
                collection = 's2'
            else:
                raise ValueError("Could not determine satellite collection "
                                 "from product name: " + products[0])

            if len(ds.time.values) > 1:

                if reduce_func in ['mean', 'median', 'std', 'max', 'min']:
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)
                        # getattr is equivalent to calling data.reduce_func
                        method_to_call = getattr(data, reduce_func)
                        data = method_to_call(dim='time')

                elif reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)
                    with HiddenPrints():
                        data = calculate_indices(data,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                else:
                    raise Exception(
                        reduce_func + " is not one of the supported" +
                        " reduce functions ('mean','median','std','max','min', 'geomedian')"
                    )

            else:
                with HiddenPrints():
                    data = calculate_indices(ds,
                                             index=calc_indices,
                                             drop=drop,
                                             collection=collection)

        # when band indices are not required, reduce the
        # dataset to a 2d array through means or (geo)medians
        if calc_indices is None:

            if len(ds.time.values) > 1:

                if reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)

                elif reduce_func in ['mean', 'median', 'std', 'max', 'min']:
                    method_to_call = getattr(ds, reduce_func)
                    data = method_to_call(dim='time')
            else:
                data = ds.squeeze()

    if zonal_stats is None:
        # If no zonal stats were requested then extract all pixel values
        flat_train = sklearn_flatten(data)
        # Make a labelled array of identical size
        flat_val = np.repeat(row[field], flat_train.shape[0])
        stacked = np.hstack((np.expand_dims(flat_val, axis=1), flat_train))

    elif zonal_stats in ['mean', 'median', 'std', 'max', 'min']:
        method_to_call = getattr(data, zonal_stats)
        flat_train = method_to_call()
        flat_train = flat_train.to_array()
        stacked = np.hstack((row[field], flat_train))

    else:
        raise Exception(
            zonal_stats + " is not one of the supported" +
            " reduce functions ('mean','median','std','max','min')")

    # Append training data and labels to list
    out_arrs.append(stacked)
    out_vars.append([field] + list(data.data_vars))
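A sketch of how this per-polygon extractor might be driven, using Example #1's annual_gm_mads_evi_training as the custom_func. The GeoDataFrame gdf, the product name, and the query values are placeholders; the docstring's note about multiprocessing suggests this loop would normally be parallelised across polygons:

# Hypothetical driver: sequential per-polygon extraction.
out_arrs, out_vars = [], []
query = {'time': ('2019-01', '2019-12'), 'resolution': (-20, 20)}  # placeholder
for index, row in gdf.iterrows():
    get_training_data_for_shp(gdf, index, row, out_arrs, out_vars,
                              products=['s2_l2a'],  # assumed product name
                              dc_query=query,
                              custom_func=annual_gm_mads_evi_training,
                              field='class', zonal_stats='median')

model_input = np.vstack(out_arrs)  # each row: class label, then feature values
column_names = out_vars[0]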
Example #9
def get_training_data_for_shp(polygons,
                              out,
                              products,
                              dc_query,
                              field=None,
                              calc_indices=None,
                              reduce_func='median',
                              drop=True,
                              zonal_stats=None,
                              collection='c1'):
    """
    Function to extract data for training a classifier using a shapefile 
    of labelled polygons.

    Parameters
    ----------
    polygons : geopandas geodataframe
        polygon data in the form of a geopandas geodataframe
    out : list
        Empty list to contain output data.
    products : list
        a list of products to load from the datacube.
        e.g. ['ls8_usgs_sr_scene', 'ls7_usgs_sr_scene']
    dc_query : dictionary
        Datacube query object, should not contain lat and long (x or y)
        variables as these are supplied by the 'polygons' variable
    field : string 
        A string containing name of column with labels in shapefile 
        attribute table. Field must contain numeric values.
    calc_indices: list, optional
        An optional list giving the names of any remote sensing indices 
        to be calculated on the loaded data (e.g. `['NDWI', 'NDVI']`. 
    reduce_func : string, optional
        Function to reduce the data from multiple time steps to
        a single timestep. Options are 'mean', 'std', 'median', or 'geomedian'.
    drop : boolean, optional
        If this variable is set to True, and 'calc_indices' are supplied, the
        spectral bands will be dropped from the dataset leaving only the
        band indices as data variables in the dataset. Default is True.
    zonal_stats: string, optional
        An optional string giving the names of zonal statistics to calculate 
        for the polygon. Default is None (all pixel values). Supported 
        values are 'mean' or 'median' 
    collection: string, optional
        to calculate band indices, the satellite collection is required.
        Options include 'c1' for Landsat C1, 'c2' for Landsat C2, and 
        's2' for Sentinel 2.

    Returns
    --------
    A list of numpy.arrays containing classes and extracted data for 
    each pixel or polygon.

    """
    #prevent function altering dictionary kwargs
    dc_query = deepcopy(dc_query)
    dc = datacube.Datacube(app='training_data')

    #set up some print statements
    i = 0
    if calc_indices is not None:
        print("Calculating indices: " + str(calc_indices))
    if reduce_func is not None:
        print("Reducing data using: " + reduce_func)
    if zonal_stats is not None:
        print("Taking zonal statistic: " + zonal_stats)

    # loop through polys and extract training data
    for index, row in polygons.iterrows():
        print(" Feature {:04}/{:04}\r".format(i + 1, len(polygons)), end='')

        # set up query based on polygon (convert to WGS84)
        geom = geometry.Geometry(polygons.geometry.values[index].__geo_interface__,
                                 geometry.CRS('epsg:4326'))

        q = {"geopolygon": geom}

        # merge polygon query with user supplied query params
        dc_query.update(q)

        # Identify the most common projection system in the input query
        output_crs = mostcommon_crs(dc=dc, product=products, query=dc_query)

        #load_ard doesn't handle geomedians
        if 'ga_ls8c_gm_2_annual' in products:
            ds = dc.load(product='ga_ls8c_gm_2_annual', **dc_query)

        else:
            # load data
            with HiddenPrints():
                ds = load_ard(dc=dc,
                              products=products,
                              output_crs=output_crs,
                              **dc_query)

        # create polygon mask (geometry_mask is True outside the polygon)
        mask = rasterio.features.geometry_mask(
            [geom.to_crs(ds.geobox.crs)],
            out_shape=ds.geobox.shape,
            transform=ds.geobox.affine,
            all_touched=False,
            invert=False)

        mask = xr.DataArray(mask, dims=("y", "x"))
        # invert so that pixels inside the polygon are kept
        ds = ds.where(~mask)

        # Check if band indices are wanted
        if calc_indices is not None:

            if len(ds.time.values) > 1:

                if reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)
                    with HiddenPrints():
                        data = calculate_indices(data,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                elif reduce_func == 'std':
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)
                    data = data.std('time')

                elif reduce_func == 'mean':
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                    data = data.mean('time')

                elif reduce_func == 'median':
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                    data = data.median('time')
            else:
                with HiddenPrints():
                    data = calculate_indices(ds,
                                             index=calc_indices,
                                             drop=drop,
                                             collection=collection)

        # when band indices are not required, reduce the
        # dataset to a 2d array through means or (geo)medians
        if calc_indices is None:
            if (len(ds.time.values) > 1) and (reduce_func is None):
                raise ValueError(
                    "Your dataset has " + str(len(ds.time.values)) +
                    " time-steps, please provide a reduction function, e.g. reduce_func='mean'"
                )

            if len(ds.time.values) > 1:
                if reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)

                elif reduce_func == 'mean':
                    data = ds.mean('time')

                elif reduce_func == 'std':
                    data = ds.std('time')

                elif reduce_func == 'median':
                    data = ds.median('time')

            else:
                data = ds.squeeze()

        # compute in case we have dask arrays
        data = data.compute()

        if zonal_stats is None:
            # If no summary stats were requested then extract all pixel values
            flat_train = sklearn_flatten(data)
            # Make a labelled array of identical size
            flat_val = np.repeat(row[field], flat_train.shape[0])
            stacked = np.hstack((np.expand_dims(flat_val, axis=1), flat_train))

        elif zonal_stats == 'mean':
            flat_train = data.mean(skipna=True)
            flat_train = flat_train.to_array()
            stacked = np.hstack((row[field], flat_train))

        elif zonal_stats == 'median':
            flat_train = data.median(skipna=True)
            flat_train = flat_train.to_array()
            stacked = np.hstack((row[field], flat_train))

        # Append training data and label to list
        out.append(stacked)
        i += 1
    # Return a list of labels for columns in output array

    return [field] + list(data.data_vars)
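For contrast with Example #8, this older variant loops over the polygons itself: extracted rows accumulate in the supplied out list and the column names come back as the return value. A hypothetical call, assuming a labelled GeoDataFrame named polygons:

out = []
query = {'time': ('2019-01', '2019-12')}  # placeholder query
column_names = get_training_data_for_shp(polygons, out,
                                         products=['ls8_usgs_sr_scene'],
                                         dc_query=query,
                                         field='class',
                                         calc_indices=['NDVI'],
                                         reduce_func='median',
                                         zonal_stats='median',
                                         collection='c1')
model_input = np.vstack(out)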
Example #10
def load_crophealth_data(lat, lon, buffer):
    """
    Loads Landsat 8 analysis-ready data (ARD) product for the crop health
    case-study area over the last two years.
    Last modified: April 2020
    
    Parameters
    ----------
    lat: float
        The central latitude to analyse
    lon: float
        The central longitude to analyse
    buffer: float
        The number of degrees to load around the central latitude and longitude.
        For reasonable loading times, set this as `0.1` or lower.

    Returns
    ----------
    ds: xarray.Dataset
        Dataset containing combined, masked data.
        Masked values are set to 'nan'.
    """
    
    # Suppress warnings
    warnings.filterwarnings('ignore')

    # Initialise the data cube. 'app' argument is used to identify this app
    dc = datacube.Datacube(app='Crophealth-app')
    
    # Define area to load
    latitude = (lat - buffer, lat + buffer)
    longitude = (lon - buffer, lon + buffer)

    # Specify the date range
    # Calculated as today's date, subtract 730 days to collect two years of data
    # Dates are converted to strings as required by loading function below
    end_date = dt.date.today()
    start_date = end_date - dt.timedelta(days=730)

    time = (start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))

    # Construct the data cube query
    products = ["ls8_usgs_sr_scene"]
    
    query = {
        'x': longitude,
        'y': latitude,
        'time': time,
        'measurements': [
            'red',
            'green',
            'blue',
            'nir',
            'swir2'
        ],
        'output_crs': 'EPSG:6933',
        'resolution': (-30, 30)
    }

    # Load the data and mask out bad quality pixels
    ds = load_ard(dc, products=products, min_gooddata=0.5, **query)

    # Calculate the normalised difference vegetation index (NDVI) across
    # all pixels for each image.
    # The index is stored as a data variable of the dataset
    ds = calculate_indices(ds, index='NDVI', collection='c1')

    # Return the data
    return ds
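A hypothetical call, with placeholder coordinates; the docstring advises keeping the buffer at 0.1 degrees or lower for reasonable load times:

ds = load_crophealth_data(lat=14.7, lon=-17.1, buffer=0.05)  # placeholder point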
Example #11
def annual_gm_mads_evi_training(ds):
    dc = datacube.Datacube(app="training")

    # grab gm+tmads
    gm_mads = dc.load(
        product="ga_s2_gm",
        time="2019",
        like=ds.geobox,
        measurements=[
            "red",
            "blue",
            "green",
            "nir",
            "swir_1",
            "swir_2",
            "red_edge_1",
            "red_edge_2",
            "red_edge_3",
            "SMAD",
            "BCMAD",
            "EMAD",
        ],
    )

    gm_mads["SMAD"] = -np.log(gm_mads["SMAD"])
    gm_mads["BCMAD"] = -np.log(gm_mads["BCMAD"])
    gm_mads["EMAD"] = -np.log(gm_mads["EMAD"] / 10000)

    # calculate band indices on gm
    gm_mads = calculate_indices(
        gm_mads, index=["EVI", "LAI", "MNDWI"], drop=False, collection="s2"
    )

    # normalise spectral GM bands 0-1
    for band in gm_mads.data_vars:
        if band not in ["SMAD", "BCMAD", "EMAD", "EVI", "LAI", "MNDWI"]:
            gm_mads[band] = gm_mads[band] / 10000

    # calculate EVI on annual timeseries
    evi = calculate_indices(
        ds, index=["EVI"], drop=True, normalise=True, collection="s2"
    )

    # EVI stats
    gm_mads["evi_std"] = evi.EVI.std(dim="time")
    gm_mads["evi_10"] = evi.EVI.quantile(0.1, dim="time")
    gm_mads["evi_25"] = evi.EVI.quantile(0.25, dim="time")
    gm_mads["evi_75"] = evi.EVI.quantile(0.75, dim="time")
    gm_mads["evi_90"] = evi.EVI.quantile(0.9, dim="time")
    gm_mads["evi_range"] = gm_mads["evi_90"] - gm_mads["evi_10"]

    # rainfall climatology
    chirps_S1 = xr_reproject(
        assign_crs(
            xr.open_rasterio(
                "/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc"
            ),
            crs="epsg:4326",
        ),
        ds.geobox,
        "bilinear",
    )

    chirps_S2 = xr_reproject(
        assign_crs(
            xr.open_rasterio(
                "/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc"
            ),
            crs="epsg:4326",
        ),
        ds.geobox,
        "bilinear",
    )

    gm_mads["rain_S1"] = chirps_S1
    gm_mads["rain_S2"] = chirps_S2

    # slope
    url_slope = "https://deafrica-data.s3.amazonaws.com/ancillary/dem-derivatives/cog_slope_africa.tif"
    slope = rio_slurp_xarray(url_slope, gbox=ds.geobox)
    slope = slope.to_dataset(name="slope")  # .chunk({'x':2000,'y':2000})

    result = xr.merge([gm_mads, slope], compat="override")

    return result.squeeze()
Example #12
    def fun(ds, era):
        # normalise SR and edev bands
        for band in ds.data_vars:
            if band not in ["sdev", "bcdev"]:
                ds[band] = ds[band] / 10000

        gm_mads = calculate_indices(
            ds,
            index=["NDVI", "LAI", "MNDWI"],
            drop=False,
            normalise=False,
            collection="s2",
        )

        gm_mads["sdev"] = -np.log(gm_mads["sdev"])
        gm_mads["bcdev"] = -np.log(gm_mads["bcdev"])
        gm_mads["edev"] = -np.log(gm_mads["edev"])

        # rainfall climatology
        if era == "_S1":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )
        elif era == "_S2":
            chirps = assign_crs(
                xr.open_rasterio(
                    "/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc"
                ),
                crs="epsg:4326",
            )

        # Clip CHIRPS to ~ S2 tile boundaries so we can handle NaNs local to S2 tile
        xmin, xmax = ds.x.values[0], ds.x.values[-1]
        ymin, ymax = ds.y.values[0], ds.y.values[-1]
        inProj = Proj("epsg:6933")
        outProj = Proj("epsg:4326")
        xmin, ymin = transform(inProj, outProj, xmin, ymin)
        xmax, ymax = transform(inProj, outProj, xmax, ymax)

        # create lat/lon indexing slices - buffer S2 bbox by 0.05deg
        if (xmin < 0) & (xmax < 0):
            x_slice = list(np.arange(xmin + 0.05, xmax - 0.05, -0.05))
        else:
            x_slice = list(np.arange(xmax - 0.05, xmin + 0.05, 0.05))

        if (ymin < 0) & (ymax < 0):
            y_slice = list(np.arange(ymin + 0.05, ymax - 0.05, -0.05))
        else:
            y_slice = list(np.arange(ymin - 0.05, ymax + 0.05, 0.05))

        # index global chirps using buffered s2 tile bbox
        chirps = assign_crs(chirps.sel(x=y_slice, y=x_slice, method="nearest"))

        # fill any NaNs in CHIRPS with local (s2-tile bbox) mean
        chirps = chirps.fillna(chirps.mean())
        chirps = xr_reproject(chirps, ds.geobox, "bilinear")
        gm_mads["rain"] = chirps

        for band in gm_mads.data_vars:
            gm_mads = gm_mads.rename({band: band + era})

        return gm_mads
Example #13
def features(ds, era):
    #normalise SR and edev bands
    for band in ds.data_vars:
        if band not in ['sdev', 'bcdev']:
            ds[band] = ds[band] / 10000

    gm_mads = calculate_indices(ds,
                                index=['NDVI', 'LAI', 'MNDWI'],
                                drop=False,
                                normalise=False,
                                collection='s2')

    gm_mads['sdev'] = -np.log(gm_mads['sdev'])
    gm_mads['bcdev'] = -np.log(gm_mads['bcdev'])
    gm_mads['edev'] = -np.log(gm_mads['edev'])

    #rainfall climatology
    if era == '_S1':
        chirps = assign_crs(xr.open_rasterio(
            '/g/data/CHIRPS/cumulative_alltime/CHPclim_jan_jun_cumulative_rainfall.nc'
        ),
                            crs='epsg:4326')

    elif era == '_S2':
        chirps = assign_crs(xr.open_rasterio(
            '/g/data/CHIRPS/cumulative_alltime/CHPclim_jul_dec_cumulative_rainfall.nc'
        ),
                            crs='epsg:4326')

    #Clip CHIRPS to ~ S2 tile boundaries so we can handle NaNs local to S2 tile
    xmin, xmax = ds.x.values[0], ds.x.values[-1]
    ymin, ymax = ds.y.values[0], ds.y.values[-1]
    inProj = Proj('epsg:6933')
    outProj = Proj('epsg:4326')
    xmin, ymin = transform(inProj, outProj, xmin, ymin)
    xmax, ymax = transform(inProj, outProj, xmax, ymax)

    #create lat/lon indexing slices - buffer S2 bbox by 0.05deg
    if (xmin < 0) & (xmax < 0):
        x_slice = list(np.arange(xmin + 0.05, xmax - 0.05, -0.05))
    else:
        x_slice = list(np.arange(xmax - 0.05, xmin + 0.05, 0.05))

    if (ymin < 0) & (ymax < 0):
        y_slice = list(np.arange(ymin + 0.05, ymax - 0.05, -0.05))
    else:
        y_slice = list(np.arange(ymin - 0.05, ymax + 0.05, 0.05))

    #index global chirps using buffered s2 tile bbox
    chirps = assign_crs(chirps.sel(x=y_slice, y=x_slice, method='nearest'))

    #fill any NaNs in CHIRPS with local (s2-tile bbox) mean
    chirps = chirps.fillna(chirps.mean())

    #reproject to match satellite data
    chirps = xr_reproject(chirps, ds.geobox, "bilinear")
    gm_mads['rain'] = chirps

    for band in gm_mads.data_vars:
        gm_mads = gm_mads.rename({band: band + era})

    return gm_mads
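One portability note on Examples #12 and #13: Proj/transform(inProj, outProj, x, y) is the pyproj 1.x calling convention and is deprecated in pyproj 2+. A sketch of the equivalent using Transformer, where always_xy=True preserves the (x, y) argument order the old call relied on:

from pyproj import Transformer

# Equivalent of transform(Proj('epsg:6933'), Proj('epsg:4326'), x, y)
to_lonlat = Transformer.from_crs("epsg:6933", "epsg:4326", always_xy=True)
xmin, ymin = to_lonlat.transform(xmin, ymin)
xmax, ymax = to_lonlat.transform(xmax, ymax)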