Example #1
clip = gpd.read_file('data/eastern_south.shp')
input_data = gpd.overlay(input_data, clip, how='intersection').reset_index()


#-------------------------------------------
#predicting using annual MADs and pre-computed other feature saved to disk

# load the column_names
with open(training_data, 'r') as file:
    header = file.readline()
    
column_names = header.split()[1:]
# load data for calculating annual geomedian + MADs
with HiddenPrints():
    ds = load_ard(dc=dc,
                  products=products,
                  dask_chunks=dask_chunks,
                  **query)

ds = ds / 10000

# compute annual TMADs (requires computing the annual geomedian)
mads = xr_geomedian_tmad(ds).compute()
mads['sdev'] = -np.log(mads['sdev'])
mads['bcdev'] = -np.log(mads['bcdev'])
mads['edev'] = -np.log(mads['edev'])
mads = mads[['sdev', 'bcdev', 'edev']]  # drop the geomedian bands
mads.to_netcdf(results + 'input/annual_mads/Eastern_tile_' + g_id + '_annual_mads.nc')
mads = mads.chunk(dask_chunks)

# load the other pre-computed input data
data = xr.open_dataset(results + 'input/Eastern_tile_' + g_id + '_inputs.nc').chunk(dask_chunks)
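
# A minimal sketch (an assumption, not part of the original script) of how the
# annual MADs and the pre-computed inputs might be combined into one feature
# stack ordered to match the training columns read above; it assumes the merged
# dataset's variable names match the entries in 'column_names'.
input_features = xr.merge([mads, data])
input_features = input_features[column_names]
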
def get_training_data_for_shp(gdf,
                              index,
                              row,
                              out_arrs,
                              out_vars,
                              products,
                              dc_query,
                              custom_func=None,
                              field=None,
                              calc_indices=None,
                              reduce_func=None,
                              drop=True,
                              zonal_stats=None):
    """
    Function to extract data from the ODC for training a machine learning classifier 
    using a geopandas geodataframe of labelled geometries. 
    This function provides a number of pre-defined methods for producing training data, 
    including calculating band indices, reducing time series using several summary statistics, 
    and/or generating zonal statistics across polygons.  The 'custom_func' parameter provides 
    a method for the user to supply a custom function for generating features rather than using the
    pre-defined methods.

    Parameters
    ----------
    gdf : geopandas geodataframe
        geometry data in the form of a geopandas geodataframe
    products : list
        a list of products to load from the datacube. 
        e.g. ['ls8_usgs_sr_scene', 'ls7_usgs_sr_scene']
    dc_query : dictionary
        Datacube query object, should not contain lat and long (x or y)
        variables as these are supplied by the 'gdf' variable
    field : string 
        A string containing the name of column with class labels. 
        Field must contain numeric values.
    out_arrs : list 
        An empty list into which the training data arrays are stored.
    out_vars : list 
        An empty list into which the data variable names are stored.
    custom_func : function, optional 
        A custom function for generating feature layers. If this parameter
        is set, all other options (excluding 'zonal_stats'), will be ignored.
        The result of the 'custom_func' must be a single xarray dataset 
        containing 2D coordinates (i.e x, y - no time dimension). The custom function
        has access to the datacube dataset extracted using the 'dc_query' params,
        along with access to the 'dc_query' dictionary itself, which could be used
        to load other products besides those specified under 'products'.
    calc_indices: list, optional
        If not using a custom func, then this parameter provides a method for
        calculating a number of remote sensing indices (e.g. `['NDWI', 'NDVI']`).
    reduce_func : string, optional 
        Function to reduce the data from multiple time steps to
        a single timestep. Options are 'mean', 'median', 'std',
        'max', 'min', 'geomedian'.  Ignored if 'custom_func' is provided.
    drop : boolean, optional
        If this variable is set to True, and 'calc_indices' are supplied, the
        spectral bands will be dropped from the dataset leaving only the
        band indices as data variables in the dataset. Default is True.
    zonal_stats : string, optional
        An optional string giving the names of zonal statistics to calculate 
        for each polygon. Default is None (all pixel values are returned). Supported 
        values are 'mean', 'median', 'max', 'min', and 'std'. Will work in 
        conjunction with a 'custom_func'.


    Returns
    --------
    The supplied 'out_arrs' and 'out_vars' lists are appended to in place: 'out_arrs'
    with numpy.arrays containing classes and extracted data for each pixel or polygon,
    and 'out_vars' with the corresponding data variable names.

    """

    # prevent function altering dictionary kwargs
    dc_query = deepcopy(dc_query)

    # remove dask chunks if supplied, as multiprocessing
    # is used for parallelization
    if 'dask_chunks' in dc_query.keys():
        dc_query.pop('dask_chunks', None)

    # connect to datacube
    dc = datacube.Datacube(app='training_data')

    # set up query based on polygon (convert to WGS84)
    geom = geometry.Geometry(gdf.geometry.values[index].__geo_interface__,
                             geometry.CRS('epsg:4326'))

    # print(geom)
    q = {"geopolygon": geom}

    # merge polygon query with user supplied query params
    dc_query.update(q)

    # Identify the most common projection system in the input query
    output_crs = mostcommon_crs(dc=dc, product=products, query=dc_query)

    # load_ard doesn't handle geomedians
    # TODO: Add support for other sensors
    if 'ga_ls8c_gm_2_annual' in products:
        ds = dc.load(product='ga_ls8c_gm_2_annual', **dc_query)
        ds = ds.where(ds != 0, np.nan)

    else:
        # load data
        with HiddenPrints():
            ds = load_ard(dc=dc,
                          products=products,
                          output_crs=output_crs,
                          **dc_query)

    # create polygon mask
    with HiddenPrints():
        mask = xr_rasterize(gdf.iloc[[index]], ds)

    # mask dataset
    ds = ds.where(mask)

    # Use custom function for training data if it exists
    if custom_func is not None:
        with HiddenPrints():
            data = custom_func(ds)

    else:
        # first check enough variables are set to run functions
        if (len(ds.time.values) > 1) and (reduce_func is None):
            raise ValueError(
                "Your dataset has " + str(len(ds.time.values)) +
                " time-steps, please provide a reduction function," +
                " e.g. reduce_func='mean'")

        if calc_indices is not None:
            # determine which collection is being loaded
            if 'level2' in products[0]:
                collection = 'c2'
            elif 'gm' in products[0]:
                collection = 'c2'
            elif 'sr' in products[0]:
                collection = 'c1'
            elif 's2' in products[0]:
                collection = 's2'

            if len(ds.time.values) > 1:

                if reduce_func in ['mean', 'median', 'std', 'max', 'min']:
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)
                        # getattr is equivalent to calling data.reduce_func
                        method_to_call = getattr(data, reduce_func)
                        data = method_to_call(dim='time')

                elif reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)
                    with HiddenPrints():
                        data = calculate_indices(data,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                else:
                    raise Exception(
                        reduce_func + " is not one of the supported" +
                        " reduce functions ('mean','median','std','max','min', 'geomedian')"
                    )

            else:
                with HiddenPrints():
                    data = calculate_indices(ds,
                                             index=calc_indices,
                                             drop=drop,
                                             collection=collection)

        # when band indices are not required, reduce the
        # dataset to a 2d array through means or (geo)medians
        if calc_indices is None:

            if len(ds.time.values) > 1:

                if reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)

                elif reduce_func in ['mean', 'median', 'std', 'max', 'min']:
                    method_to_call = getattr(ds, reduce_func)
                    data = method_to_call('time')
            else:
                data = ds.squeeze()

    if zonal_stats is None:
        # If no zonal stats were requested then extract all pixel values
        flat_train = sklearn_flatten(data)
        # Make a labelled array of identical size
        flat_val = np.repeat(row[field], flat_train.shape[0])
        stacked = np.hstack((np.expand_dims(flat_val, axis=1), flat_train))

    elif zonal_stats in ['mean', 'median', 'std', 'max', 'min']:
        method_to_call = getattr(data, zonal_stats)
        flat_train = method_to_call()
        flat_train = flat_train.to_array()
        stacked = np.hstack((row[field], flat_train))

    else:
        raise Exception(
            zonal_stats + " is not one of the supported" +
            " reduce functions ('mean','median','std','max','min')")

    # Append training data and labels to list
    out_arrs.append(stacked)
    out_vars.append([field] + list(data.data_vars))
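
# The following is an illustrative sketch (not part of the original module) of one
# way to drive the function above serially; the shapefile path, product name, query
# values and the 'class' field are placeholders rather than values from the source.
import geopandas as gpd
import numpy as np

gdf = gpd.read_file('path/to/training_polygons.shp')
out_arrs, out_vars = [], []
query = {'time': ('2018-01', '2018-12'),
         'measurements': ['red', 'nir'],
         'resolution': (-30, 30)}

for index, row in gdf.iterrows():
    get_training_data_for_shp(gdf, index, row, out_arrs, out_vars,
                              products=['ls8_usgs_sr_scene'],
                              dc_query=query,
                              field='class',
                              calc_indices=['NDVI'],
                              reduce_func='median',
                              zonal_stats='mean')

# stack the per-polygon results into a single model input array
model_input = np.vstack(out_arrs)
column_names = out_vars[0]
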
Example #3
def get_training_data_for_shp(polygons,
                              out,
                              products,
                              dc_query,
                              field=None,
                              calc_indices=None,
                              reduce_func='median',
                              drop=True,
                              zonal_stats=None,
                              collection='c1'):
    """
    Function to extract data for training a classifier using a shapefile 
    of labelled polygons.

    Parameters
    ----------
    polygons : geopandas geodataframe
        polygon data in the form of a geopandas geodataframe
    out : list
        Empty list to contain output data.
    products : list
        a list of products to load from the datacube. 
        e.g. ['ls8_usgs_sr_scene', 'ls7_usgs_sr_scene']
    dc_query : dictionary
        Datacube query object, should not contain lat and long (x or y)
        variables as these are supplied by the 'polygons' variable
    field : string 
        A string containing name of column with labels in shapefile 
        attribute table. Field must contain numeric values.
    calc_indices: list, optional
        An optional list giving the names of any remote sensing indices 
        to be calculated on the loaded data (e.g. `['NDWI', 'NDVI']`).
    reduce_func : string, optional
        Function to reduce the data from multiple time steps to
        a single timestep. Options are 'mean', 'median', 'std', or 'geomedian'.
    drop : boolean, optional
        If this variable is set to True, and 'calc_indices' are supplied, the
        spectral bands will be dropped from the dataset leaving only the
        band indices as data variables in the dataset. Default is True.
    zonal_stats: string, optional
        An optional string giving the names of zonal statistics to calculate 
        for the polygon. Default is None (all pixel values). Supported 
        values are 'mean' or 'median'.
    collection: string, optional
        to calculate band indices, the satellite collection is required.
        Options include 'c1' for Landsat C1, 'c2' for Landsat C2, and 
        's2' for Sentinel 2.

    Returns
    --------
    A list of numpy.arrays containing classes and extracted data for 
    each pixel or polygon.

    """
    #prevent function altering dictionary kwargs
    dc_query = deepcopy(dc_query)
    dc = datacube.Datacube(app='training_data')

    #set up some print statements
    i = 0
    if calc_indices is not None:
        print("Calculating indices: " + str(calc_indices))
    if reduce_func is not None:
        print("Reducing data using: " + reduce_func)
    if zonal_stats is not None:
        print("Taking zonal statistic: " + zonal_stats)

    # loop through polys and extract training data
    for index, row in polygons.iterrows():
        print(" Feature {:04}/{:04}\r".format(i + 1, len(polygons)), end='')

        # set up query based on polygon (convert to WGS84)
        geom = geometry.Geometry(polygons.geometry.values[index].__geo_interface__,
                                 geometry.CRS('epsg:4326'))

        q = {"geopolygon": geom}

        # merge polygon query with user supplied query params
        dc_query.update(q)

        # Identify the most common projection system in the input query
        output_crs = mostcommon_crs(dc=dc, product=products, query=dc_query)

        #load_ard doesn't handle geomedians
        if 'ga_ls8c_gm_2_annual' in products:
            ds = dc.load(product='ga_ls8c_gm_2_annual', **dc_query)

        else:
            # load data
            with HiddenPrints():
                ds = load_ard(dc=dc,
                              products=products,
                              output_crs=output_crs,
                              **dc_query)

        # create polygon mask
        mask = rasterio.features.geometry_mask(
            [geom.to_crs(ds.geobox.crs)],
            out_shape=ds.geobox.shape,
            transform=ds.geobox.affine,
            all_touched=False,
            invert=False)

        mask = xr.DataArray(mask, dims=("y", "x"))
        ds = ds.where(mask == False)

        # Check if band indices are wanted
        if calc_indices is not None:

            if len(ds.time.values) > 1:

                if reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)
                    with HiddenPrints():
                        data = calculate_indices(data,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                elif reduce_func == 'std':
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)
                    data = data.std('time')

                elif reduce_func == 'mean':
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                    data = data.mean('time')

                elif reduce_func == 'median':
                    with HiddenPrints():
                        data = calculate_indices(ds,
                                                 index=calc_indices,
                                                 drop=drop,
                                                 collection=collection)

                    data = data.median('time')
            else:
                with HiddenPrints():
                    data = calculate_indices(ds,
                                             index=calc_indices,
                                             drop=drop,
                                             collection=collection)

        # when band indices are not required, reduce the
        # dataset to a 2d array through means or (geo)medians
        if calc_indices is None:
            if (len(ds.time.values) > 1) and (reduce_func is None):
                raise ValueError(
                    "Your dataset has " + str(len(ds.time.values)) +
                    " time-steps, please provide a reduction function, e.g. reduce_func='mean'"
                )

            if len(ds.time.values) > 1:
                if reduce_func == 'geomedian':
                    data = GeoMedian().compute(ds)

                if reduce_func == 'mean':
                    data = ds.mean('time')

                if reduce_func == 'std':
                    data = ds.std('time')

                if reduce_func == 'median':
                    data = ds.median('time')

            else:
                data = ds.squeeze()

        # compute in case we have dask arrays
        data = data.compute()

        if zonal_stats is None:
            # If no summary stats were requested then extract all pixel values
            flat_train = sklearn_flatten(data)
            # Make a labelled array of identical size
            flat_val = np.repeat(row[field], flat_train.shape[0])
            stacked = np.hstack((np.expand_dims(flat_val, axis=1), flat_train))

        elif zonal_stats == 'mean':
            flat_train = data.mean(axis=None, skipna=True)
            flat_train = flat_train.to_array()
            stacked = np.hstack((row[field], flat_train))

        elif zonal_stats == 'median':
            flat_train = data.median(axis=None, skipna=True)
            flat_train = flat_train.to_array()
            stacked = np.hstack((row[field], flat_train))

        # Append training data and label to list
        out.append(stacked)
        i += 1
    # Return a list of labels for columns in output array

    return [field] + list(data.data_vars)
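
# Illustrative usage sketch (not from the original module) for the variant above,
# which loops over the polygons internally; the shapefile path, product, query
# values and 'class' field below are placeholders.
import geopandas as gpd
import numpy as np

polygons = gpd.read_file('path/to/training_polygons.shp')
out = []
query = {'time': ('2018-01', '2018-12'),
         'measurements': ['red', 'nir'],
         'resolution': (-30, 30)}

column_names = get_training_data_for_shp(polygons, out,
                                         products=['ls8_usgs_sr_scene'],
                                         dc_query=query,
                                         field='class',
                                         calc_indices=['NDVI'],
                                         reduce_func='median',
                                         zonal_stats='mean',
                                         collection='c1')
model_input = np.vstack(out)
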
Example #4
def run_filmstrip_app(output_name,
                      time_range,
                      time_step,
                      tide_range=(0.0, 1.0),
                      resolution=(-30, 30),
                      max_cloud=0.5,
                      ls7_slc_off=False,
                      size_limit=10000):
    '''
    An interactive app that allows the user to select a region from a
    map, then load Digital Earth Africa Landsat data and combine it
    using the geometric median ("geomedian") statistic to reveal the 
    median or 'typical' appearance of the landscape for a series of 
    time periods.
    
    The results for each time period are combined into a 'filmstrip' 
    plot which visualises how the landscape has changed in appearance 
    across time, with a 'change heatmap' panel highlighting potential 
    areas of greatest change.
    
    For coastal applications, the analysis can be customised to select 
    only satellite images obtained during a specific tidal range 
    (e.g. low, average or high tide).
    
    Last modified: April 2020

    Parameters
    ----------  
    output_name : str
        A name that will be used to name the output filmstrip plot file.
    time_range : tuple
        A tuple giving the date range to analyse 
        (e.g. `time_range = ('1988-01-01', '2017-12-31')`).
    time_step : dict
        This parameter sets the length of the time periods to compare 
        (e.g. `time_step = {'years': 5}` will generate one filmstrip 
        plot for every five years of data; `time_step = {'months': 18}` 
        will generate one plot for each 18 month period etc. Time 
        periods are counted from the first value given in `time_range`.
    tide_range : tuple, optional
        An optional parameter that can be used to generate filmstrip 
        plots based on specific ocean tide conditions. This can be 
        valuable for analysing change consistently along the coast. 
        For example, `tide_range = (0.0, 0.2)` will select only 
        satellite images acquired at the lowest 20% of tides; 
        `tide_range = (0.8, 1.0)` will select images from the highest 
        20% of tides. The default is `tide_range = (0.0, 1.0)` which 
        will select all images regardless of tide.
    resolution : tuple, optional
        The spatial resolution to load data. The default is 
        `resolution = (-30, 30)`, which will load data at 30 m pixel 
        resolution. Increasing this (e.g. to `resolution = (-100, 100)`) 
        can be useful for loading large spatial extents.
    max_cloud : float, optional
        This parameter can be used to exclude satellite images with 
        excessive cloud. The default is `0.5`, which will keep all images 
        with less than 50% cloud.
    ls7_slc_off : bool, optional
        An optional boolean indicating whether to include data from 
        after the Landsat 7 SLC failure (i.e. SLC-off). Defaults to 
        False, which removes all Landsat 7 observations > May 31 2003.
    size_limit : int, optional
        An optional integer (in hectares) specifying the size limit 
        for the data query. Queries larger than this size will receive
        a warning that the data query is too large (and may
        therefore result in memory errors).
        
        
    Returns
    -------
    ds_geomedian : xarray Dataset
        An xarray dataset containing geomedian composites for each 
        timestep in the analysis.
        
    '''

    ########################
    # Select and load data #
    ########################

    # Define centre_coords as a global variable
    global centre_coords

    # Test if centre_coords is in the global namespace;
    # use default value if it isn't
    if 'centre_coords' not in globals():
        centre_coords = (6.587292, 1.532833)

    # Plot interactive map to select area
    basemap = basemap_to_tiles(basemaps.Esri.WorldImagery)
    geopolygon = select_on_a_map(height='600px',
                                 layers=(basemap, ),
                                 center=centre_coords,
                                 zoom=14)

    # Set centre coords based on most recent selection to re-focus
    # subsequent data selections
    centre_coords = geopolygon.centroid.points[0][::-1]

    # Test size of selected area
    msq_per_hectare = 10000
    area = (geopolygon.to_crs(crs=CRS('epsg:6933')).area / msq_per_hectare)
    radius = np.round(np.sqrt(size_limit), 1)
    if area > size_limit:
        print(f'Warning: Your selected area is {area:.00f} hectares. '
              f'Please select an area of less than {size_limit} hectares.'
              f'\nTo select a smaller area, re-run the cell '
              f'above and draw a new polygon.')

    else:

        print('Starting analysis...')

        # Connect to datacube database
        dc = datacube.Datacube(app='Change_filmstrips')

        # Configure local dask cluster
        create_local_dask_cluster()

        # Obtain native CRS
        crs = mostcommon_crs(dc=dc,
                             product='ls5_usgs_sr_scene',
                             query={
                                 'time': '1990',
                                 'geopolygon': geopolygon
                             })

        # Create query based on time range, area selected, custom params
        query = {
            'time': time_range,
            'geopolygon': geopolygon,
            'output_crs': crs,
            'resolution': resolution,
            'dask_chunks': {
                'x': 3000,
                'y': 3000
            },
            'align': (resolution[1] / 2.0, resolution[1] / 2.0)
        }

        # Load data from all three Landsats
        warnings.filterwarnings("ignore")
        ds = load_ard(dc=dc,
                      measurements=['red', 'green', 'blue'],
                      products=[
                          'ls5_usgs_sr_scene', 'ls7_usgs_sr_scene',
                          'ls8_usgs_sr_scene'
                      ],
                      min_gooddata=max_cloud,
                      ls7_slc_off=ls7_slc_off,
                      **query)

        # Optionally calculate tides for each timestep in the satellite
        # dataset and drop any observations outside this range
        if tide_range != (0.0, 1.0):
            ds = tidal_tag(ds=ds, tidepost_lat=None, tidepost_lon=None)
            min_tide, max_tide = ds.tide_height.quantile(tide_range).values
            ds = ds.sel(time=(ds.tide_height >= min_tide)
                        & (ds.tide_height <= max_tide))
            ds = ds.drop('tide_height')
            print(f'    Keeping {len(ds.time)} observations with tides '
                  f'between {min_tide:.2f} and {max_tide:.2f} m')

        # Create time step ranges to generate filmstrips from
        bins_dt = pd.date_range(start=time_range[0],
                                end=time_range[1],
                                freq=pd.DateOffset(**time_step))

        # Bin all satellite observations by timestep. If some observations
        # fall outside the upper bin, label these with the highest bin
        labels = bins_dt.astype('str')
        time_steps = (pd.cut(ds.time.values, bins_dt,
                             labels=labels[:-1]).add_categories(
                                 labels[-1]).fillna(labels[-1]))
        time_steps_var = xr.DataArray(time_steps, [('time', ds.time)],
                                      name='timestep')

        # Resample data temporally into time steps, and compute geomedians
        ds_geomedian = ds.groupby(time_steps_var).apply(
            lambda ds_subset: xr_geomedian(
                ds_subset,
                # disable internal threading; dask will run several workers concurrently
                num_threads=1,
                # eps is ~1/5 of the scaled pixel value resolution
                eps=0.2 * (1 / 10_000),
                # disable some checks inside the geomedian library that use too much RAM
                nocheck=True))
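
# Sketch of how the interactive app above might be launched from a notebook cell
# (an illustration only; the output name, dates and resolution are placeholders):
ds_filmstrips = run_filmstrip_app(output_name='example_area',
                                  time_range=('2000-01-01', '2019-12-31'),
                                  time_step={'years': 5},
                                  resolution=(-60, 60),
                                  max_cloud=0.5)
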
Example #5
def load_crophealth_data(lat, lon, buffer):
    """
    Loads Landsat 8 analysis-ready data (ARD) product for the crop health
    case-study area over the last two years.
    Last modified: April 2020
    
    Parameters
    ----------
    lat: float
        The central latitude to analyse
    lon: float
        The central longitude to analyse
    buffer: float
        The number of degrees to load around the central latitude and longitude.
        For reasonable loading times, set this as `0.1` or lower.

    Returns
    -------
    ds: xarray.Dataset
        Dataset containing combined, masked data.
        Masked values are set to 'nan'.
    """
    
    # Suppress warnings
    warnings.filterwarnings('ignore')

    # Initialise the data cube. 'app' argument is used to identify this app
    dc = datacube.Datacube(app='Crophealth-app')
    
    # Define area to load
    latitude = (lat - buffer, lat + buffer)
    longitude = (lon - buffer, lon + buffer)

    # Specify the date range
    # Calculated as today's date, subtract 730 days to collect two years of data
    # Dates are converted to strings as required by loading function below
    end_date = dt.date.today()
    start_date = end_date - dt.timedelta(days=730)

    time = (start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))

    # Construct the data cube query
    products = ["ls8_usgs_sr_scene"]
    
    query = {
        'x': longitude,
        'y': latitude,
        'time': time,
        'measurements': [
            'red',
            'green',
            'blue',
            'nir',
            'swir2'
        ],
        'output_crs': 'EPSG:6933',
        'resolution': (-30, 30)
    }

    # Load the data and mask out bad quality pixels
    ds = load_ard(dc, products=products, min_gooddata=0.5, **query)

    # Calculate the normalised difference vegetation index (NDVI) across
    # all pixels for each image.
    # The index is added to the dataset as a new data variable
    ds = calculate_indices(ds, index='NDVI', collection='s2')

    # Return the data
    return ds
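
# Illustrative call (the coordinates and buffer below are placeholders, not a
# recommended study site); returns two years of masked Landsat 8 data with NDVI:
ds = load_crophealth_data(lat=-0.45, lon=34.15, buffer=0.05)
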
Example #6
def WIT_drill(gdf_poly,
              time,
              min_gooddata=0.80,
              TCW_threshold=-6000,
              export_csv=None,
              dask_chunks=None):
    """
    The Wetlands Insight Tool. This function loads FC, WOfS, and Landsat ARD data,
    and calculates tasseled cap wetness, in order to determine the dominant
    land cover class within a polygon at each satellite observation.

    The output is a pandas dataframe containing a timeseries of the relative
    fractions of each class at each time-step. This forms the input to produce
    a stacked line-plot.

    Last modified: Feb 2020

    Parameters
    ----------  
    gdf_poly : geopandas.GeoDataFrame
        The dataframe must only contain a single row,
        containing the polygon you wish to interrogate.
    time : tuple
        a tuple containing the time range over which to run the WIT.
        e.g. ('2015-01' , '2019-12')
    min_gooddata : Float, optional
        A number between 0 and 1 (e.g. 0.8) indicating the minimum percentage
        of good quality pixels required for a satellite observation to be loaded
        and therefore included in the WIT plot.  Defaults to 0.8, which should
        be considered a minimum percentage.
    TCW_threshold : Int, optional
        The tasseled cap wetness threshold, beyond which a pixel will be 
        considered 'wet'. Defaults to -6000. Consider the surface reflectance
        scaling of the Landsat product when adjusting this (C2 = 1-65,535) 
    export_csv : str, optional
        To export the returned pandas dataframe provide
        a location string (e.g. 'output/results.csv')
    dask_chunks : dict, optional
        To lazily load the datasets using dask, pass a dictionary containing
        the dimensions over which to chunk e.g. {'time':-1, 'x':250, 'y':250}.
        The function is not currently set up to handle dask arrays very well, so
        the memory efficiency gained by using dask will be limited here.
        
    Returns
    -------
    PolyDrill_df : Pandas.Dataframe
        A pandas dataframe containing the timeseries of relative fractions
        of each land cover class (WOfS, FC, TCW).

    """

    print("working on polygon: " +
          str(gdf_poly.drop('geometry', axis=1).values) + ".  ")

    # make query from polygon
    geom = geometry.Geometry(gdf_poly.geometry.values[0].__geo_interface__,
                             geometry.CRS("epsg:4326"))
    query = {"geopolygon": geom, "time": time}

    # set Sandbox configs to load COGs faster
    datacube.utils.rio.set_default_rio_config(aws="auto", cloud_defaults=True)

    # Create a datacube instance
    dc = datacube.Datacube(app="wetlands insight tool")

    # find UTM crs for location
    crs = deafrica_datahandling.mostcommon_crs(dc=dc,
                                               product="usgs_ls8c_level2_2",
                                               query=query)

    # load landsat 5,7,8 data
    ls578_ds = deafrica_datahandling.load_ard(
        dc=dc,
        products=["usgs_ls8c_level2_2"],
        output_crs=crs,
        min_gooddata=min_gooddata,
        measurements=["red", "green", "blue", "nir", "swir_1", "swir_2"],
        align=(15, 15),
        dask_chunks=dask_chunks,
        group_by='solar_day',
        resolution=(-30, 30),
        **query,
    )

    # mask the data with our original polygon to remove extra data
    data = ls578_ds
    mask = rasterio.features.geometry_mask(
        [geom.to_crs(data.geobox.crs)],
        out_shape=data.geobox.shape,
        transform=data.geobox.affine,
        all_touched=False,
        invert=False,
    )

    # mask the data with the polygon
    mask_xr = xr.DataArray(mask, dims=("y", "x"))
    ls578_ds = data.where(mask_xr == False)
    print("size of wetlands array: " +
          str(ls578_ds.isel(time=1).red.values.shape))

    ls578_ds = ls578_ds.compute()

    # calculate tasselled cap wetness within masked AOI
    print("calculating tasseled cap index ")
    tci = thresholded_tasseled_cap(ls578_ds,
                                   wetness_threshold=TCW_threshold,
                                   drop=True,
                                   drop_tc_bands=True)
    # select only finite values (over threshold values)
    tcw = xr.ufuncs.isfinite(tci.wetness_thresholded)
    # #reapply the polygon mask
    tcw = tcw.where(mask_xr == False)

    print("Loading WOfS layers ")
    wofls = dc.load(
        product="ga_ls8c_wofs_2",
        like=ls578_ds,
        fuse_func=wofs_fuser,
        dask_chunks=dask_chunks,
    )
    wofls = wofls.where(wofls.time == tcw.time)
    # #reapply the polygon mask
    wofls = wofls.where(mask_xr == False)
    wofls = wofls.compute()

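    # bit value for wet (128) with no other flags set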
    wet_wofs = wofls.where(wofls.water == 128)

    # use bit values for wet (128) and terrain/low-angle (8)
    shadow_wofs = wofls.where(wofls.water == 136)
    # bit values for wet (128) and sea (4)
    sea_wofs = wofls.where(wofls.water == 132)
    # bit values for wet (128) and sea (4) and terrain/low-angle (8)
    sea_shadow_wofs = wofls.where(wofls.water == 140)

    # load Fractional Cover
    print("Loading Fractional Cover")
    fc_ds = dc.load(
        product="ga_ls8c_fractional_cover_2",
        dask_chunks=dask_chunks,
        like=ls578_ds,
        measurements=["pv", "npv", "bs"],
    )
    # use landsat data set to cloud mask FC
    fc_ds = fc_ds.where(ls578_ds.red)

    # mask with polygon
    fc_ds = fc_ds.where(mask_xr == False)
    fc_ds = fc_ds.compute()

    fc_ds_noTCW = fc_ds.where(tcw == False)

    print("Generating classification")
    # match timesteps
    fc_ds_noTCW = fc_ds_noTCW.where(fc_ds_noTCW.time == tcw.time)

    # following robbi's advice, cast the dataset to a dataarray
    maxFC = fc_ds_noTCW.to_array(dim="variable", name="maxFC")

    # cast the FC array to integer, as nanargmax doesn't handle floats the way we want
    FC_int = maxFC.astype("int8")

    # use numpy.nanargmax to get the index of the maximum value along the variable dimension
    # BSPVNPV=np.nanargmax(FC_int, axis=0)
    BSPVNPV = FC_int.argmax(dim="variable")

    FC_mask = xr.ufuncs.isfinite(maxFC).all(dim="variable")

    # #re-mask with nans to remove no-data
    BSPVNPV = BSPVNPV.where(FC_mask)
    # restack the Fractional cover dataset all together
    # CAUTION: ARGMAX DEPENDS ON THE ORDER OF VARIABLES IN THE
    # DATASET, WHICH WILL DIFFER BETWEEN COLLECTIONS.
    # ADJUST THE 0, 1, 2 INDICES BELOW DEPENDING ON THE ORDER OF
    # FC VARIABLES IN THE DATASET.
    FC_dominant = xr.Dataset({
        "BS": (BSPVNPV == 2).where(FC_mask),
        "PV": (BSPVNPV == 0).where(FC_mask),
        "NPV": (BSPVNPV == 1).where(FC_mask),
    })
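    # The 0/1/2 indices above assume the argmax follows the measurement order
    # loaded for fc_ds (['pv', 'npv', 'bs']); printing list(maxFC['variable'].values)
    # is a quick way to confirm that order before trusting this mapping.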
    # count number of Fractional Cover pixels for each cover type in area of interest
    FC_count = FC_dominant.sum(dim=["x", "y"])

    # number of pixels in area of interest
    pixels = (mask_xr == 0).sum(dim=["x", "y"])

    # count number of tcw pixels
    tcw_pixel_count = tcw.sum(dim=["x", "y"])


    wofs_pixels = (wet_wofs.water.count(dim=["x", "y"]) +
                   shadow_wofs.water.count(dim=["x", "y"]) +
                   sea_wofs.water.count(dim=["x", "y"]) +
                   sea_shadow_wofs.water.count(dim=["x", "y"]))

    # count percentage of area of wofs
    wofs_area_percent = (wofs_pixels / pixels) * 100


    # calculate percentage area wet
    tcw_area_percent = (tcw_pixel_count / pixels) * 100

    # calculate wet not wofs
    tcw_less_wofs = tcw_area_percent - wofs_area_percent

    # Fractional cover pixel count method
    # Get number of FC pixels, divide by total number of pixels per polygon
    # Work out the number of nodata pixels in the data, so that we can graph the variables by number of observed pixels.
    Bare_soil_percent = (FC_count.BS / pixels) * 100
    Photosynthetic_veg_percent = (FC_count.PV / pixels) * 100
    NonPhotosynthetic_veg_percent = (FC_count.NPV / pixels) * 100
    NoData = (100 - wofs_area_percent - tcw_less_wofs -
              Photosynthetic_veg_percent - NonPhotosynthetic_veg_percent -
              Bare_soil_percent)
    NoDataPixels = (NoData / 100) * pixels

    # Fractional cover pixel count method
    # Get number of FC pixels, divide by total number of pixels per polygon
    Bare_soil_percent2 = (FC_count.BS / (pixels - NoDataPixels)) * 100
    Photosynthetic_veg_percent2 = (FC_count.PV / (pixels - NoDataPixels)) * 100
    NonPhotosynthetic_veg_percent2 = (FC_count.NPV /
                                      (pixels - NoDataPixels)) * 100

    # count percentage of area of wofs
    wofs_area_percent2 = (wofs_pixels / (pixels - NoDataPixels)) * 100
    # count number of tcw pixels
    tcw_pixel_count2 = tcw.sum(dim=["x", "y"])

    # calculate percentage area wet
    tcw_area_percent2 = (tcw_pixel_count2 / (pixels - NoDataPixels)) * 100

    # calculate wet not wofs
    tcw_less_wofs2 = tcw_area_percent2 - wofs_area_percent2

    # last check for timestep matching before we plot
    wofs_area_percent2 = wofs_area_percent2.where(
        wofs_area_percent2.time == Bare_soil_percent2.time)
    Bare_soil_percent2 = Bare_soil_percent2.where(
        Bare_soil_percent2.time == wofs_area_percent2.time)
    Photosynthetic_veg_percent2 = Photosynthetic_veg_percent2.where(
        Photosynthetic_veg_percent2.time == wofs_area_percent2.time)
    NonPhotosynthetic_veg_percent2 = NonPhotosynthetic_veg_percent2.where(
        NonPhotosynthetic_veg_percent2.time == wofs_area_percent2.time)

    # start setup of dataframe by adding only one dataset
    WOFS_df = pd.DataFrame(
        data=wofs_area_percent2.data,
        index=wofs_area_percent2.time.values,
        columns=["wofs_area_percent"],
    )

    # add data into pandas dataframe for export
    WOFS_df["wet_percent"] = tcw_less_wofs2.data
    WOFS_df["green_veg_percent"] = Photosynthetic_veg_percent2.data
    WOFS_df["dry_veg_percent"] = NonPhotosynthetic_veg_percent2.data
    WOFS_df["bare_soil_percent"] = Bare_soil_percent2.data

    # call the composite dataframe something sensible, like PolyDrill
    PolyDrill_df = WOFS_df.round(2)

    # save the csv of the output data used to create the stacked plot for the polygon drill
    if export_csv:
        print('exporting csv: ' + export_csv)
        PolyDrill_df.to_csv(export_csv, index_label="Datetime")

    ls578_ds = None
    data = None
    fc_ds = None
    wofls = None
    tci = None

    return PolyDrill_df
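
# Illustrative usage sketch (not part of the original module); the shapefile path,
# date range and CSV path are placeholders. The polygon geodataframe must contain
# a single row, and the returned dataframe can be plotted as a stacked area chart.
import geopandas as gpd

gdf = gpd.read_file('path/to/wetland_polygon.shp').to_crs('epsg:4326')
wit_df = WIT_drill(gdf_poly=gdf,
                   time=('2015-01', '2019-12'),
                   min_gooddata=0.85,
                   TCW_threshold=-6000,
                   export_csv='results/wit_output.csv')
wit_df.plot.area(figsize=(11, 4))
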