Example 1
def mask_source_lonlats(source_def, mask):
    """Mask source longitudes and latitudes to match data mask."""
    source_geo_def = source_def

    # the data may have additional masked pixels
    # let's compare them to see if we can use the same area
    # assume lons and lats mask are the same
    if mask is not None and mask is not False and isinstance(source_geo_def, SwathDefinition):
        if np.issubdtype(mask.dtype, np.bool_):
            # copy the source area and use it for the rest of the calculations
            LOG.debug("Copying source area to mask invalid dataset points")
            if mask.ndim != source_geo_def.lons.ndim:
                raise ValueError("Can't mask area, mask has different number "
                                 "of dimensions.")

            return SwathDefinition(source_geo_def.lons.where(~mask),
                                   source_geo_def.lats.where(~mask))
        else:
            return SwathDefinition(source_geo_def.lons.where(~xu.isnan(mask)),
                                   source_geo_def.lats.where(~xu.isnan(mask)))

    return source_geo_def
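
A minimal usage sketch (hypothetical values; LOG, np and xu are assumed to be the module-level imports of the original source file):

import numpy as np
import xarray as xr
from pyresample.geometry import SwathDefinition

lons = xr.DataArray(np.array([[10.0, 10.1], [10.2, 10.3]]), dims=('y', 'x'))
lats = xr.DataArray(np.array([[50.0, 50.0], [50.1, 50.1]]), dims=('y', 'x'))
swath = SwathDefinition(lons, lats)

# boolean mask of invalid data pixels; lon/lat become NaN wherever it is True
data_mask = xr.DataArray(np.array([[False, True], [False, False]]), dims=('y', 'x'))
masked_swath = mask_source_lonlats(swath, data_mask)
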
Example 2
def mask_source_lonlats(source_def, mask):
    """Mask source longitudes and latitudes to match data mask."""
    source_geo_def = source_def

    # the data may have additional masked pixels
    # let's compare them to see if we can use the same area
    # assume lons and lats mask are the same
    if mask is not None and mask is not False and isinstance(
            source_geo_def, SwathDefinition):
        if np.issubdtype(mask.dtype, np.bool_):
            # copy the source area and use it for the rest of the calculations
            LOG.debug("Copying source area to mask invalid dataset points")
            if mask.ndim != source_geo_def.lons.ndim:
                raise ValueError("Can't mask area, mask has different number "
                                 "of dimensions.")

            return SwathDefinition(source_geo_def.lons.where(~mask),
                                   source_geo_def.lats.where(~mask))
        else:
            return SwathDefinition(source_geo_def.lons.where(~xu.isnan(mask)),
                                   source_geo_def.lats.where(~xu.isnan(mask)))

    return source_geo_def
Example 3
import sys

import numpy as np
import xarray as xr
import xarray.ufuncs as xu
import matplotlib.pyplot as plt

plt.rcParams.update({'font.size': 14})

variable=sys.argv[1]
section=sys.argv[2]

surf_path='/DataArchive/C3S/subsurf_temp'

ds = xr.open_dataset(surf_path+'/Results/'+variable+'_'+section+'_ORCA-0.25x0.25_regular_1979_2018.nc')
var = ds[variable].mean(dim='time').squeeze().rename(variable + r' $[^oC]$')
colors='YlOrRd'
if (section=='A3'):
  xname='longitude'
else:
  xname='latitude'

mask = xu.isnan(var)
fig = plt.figure(1, figsize=(15,8))
ax = fig.add_subplot(211)
p = var.sel(depth=slice(0,1000)).plot.contourf(ax=ax, cmap=colors,vmin=np.nanmin(var), vmax=np.nanmax(var), extend='both', levels=31, cbar_kwargs={'drawedges': True, 'shrink' : 1.})
mask.sel(depth=slice(0,1000)).plot.contour(ax=ax, levels=1, colors='k')
ax.set_xlabel(xname)
ax.invert_yaxis()
ax2 = fig.add_subplot(212)
p2 = var.plot.contourf(ax=ax2, cmap=colors, vmin=np.nanmin(var), vmax=np.nanmax(var), extend='both', levels=31, cbar_kwargs={'drawedges': True, 'shrink' : 1.})
mask.plot.contour(ax=ax2, levels=1, colors='k')
ax2.set_xlabel(xname)
ax2.invert_yaxis()
fig.tight_layout()
fig.savefig(surf_path+'/Figures/'+variable+'_'+section+'mean_ORCA-0.25x0.25_regular_1979_2018.png', transparent=True, dpi=300)

Example 4
if inormalise:  # normalise to zero mean and unit (1) standard deviation (see the toy sketch at the end of this example)
    logging.info('normalise data ...')
    datamean = dataxr_filled.mean(dim=('time', 'landpoints'))
    datastd = dataxr_filled.std(dim=('time', 'landpoints'))
    dataxr_lost = (dataxr_lost - datamean) / datastd
    dataxr = (dataxr - datamean) / datastd
    dataxr_filled = (dataxr_filled - datamean) / datastd
    invarmean = timeinvariant.mean(dim=('time', 'landpoints'))
    invarstd = timeinvariant.std(dim=('time', 'landpoints'))
    timeinvariant = (timeinvariant - invarmean) / invarstd

timebeforefill = datetime.now()

# obtain lostmask
lostmask = xu.isnan(
    dataxr_lost
)  # missing (lost + orig missing + ocean) is True, rest is False

# impute data
logging.info(f'impute data with method {fill_method} ...')
if fill_method in ['mean_impute', 'locf']:
    pass
elif fill_method in [
        'missforestmean', 'missforestlocf', 'ridgemean', 'ridgelocf',
        'svdimputemean', 'svdimputelocf', 'gp', 'anchorRegression'
]:

    #  stack var and invar
    logging.info(f'{fill_method}: stack ...')
    ntimesteps = dataxr_filled.coords['time'].size
    timeinvariant = np.repeat(timeinvariant, ntimesteps, axis=1)
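
For reference, the zero-mean, unit-standard-deviation scaling used earlier in this example, shown on a toy DataArray together with its inverse (all names here are illustrative):

import numpy as np
import xarray as xr

toy = xr.DataArray(np.random.rand(4, 3), dims=('time', 'landpoints'))
toy_mean = toy.mean(dim=('time', 'landpoints'))
toy_std = toy.std(dim=('time', 'landpoints'))
toy_norm = (toy - toy_mean) / toy_std      # zero mean, unit standard deviation
toy_back = toy_norm * toy_std + toy_mean   # inverse transform recovers the original values
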
Example 5
    def make_climatology(ds,
                         output_frequency,
                         monthly_weights=False,
                         time_var_name='time',
                         time_dim_name='t_dim',
                         fn_out=None,
                         missing_values=False):
        '''
        Calculates a climatology for all variables in a supplied dataset.
        The resulting xarray dataset will NOT be loaded to RAM. Instead,
        it is a set of dask operations. To load to RAM use, e.g. .compute().
        However, if the original data was large, this may take a long time and
        a lot of memory. Make sure you have the available RAM or chunking
        and parallel processes are specified correctly.
        
        Otherwise, it is recommended that you access the climatology data
        in an indexed way, i.e. compute only specific parts of the data
        at once.

        The resulting climatology dataset can be written to disk using
        .to_netcdf(). Again, this may take a while for larger datasets.
        
        ds :: xarray dataset object from a COAsT object.
        output_frequency :: any xarray groupby string. i.e:
            'month'
            'season'
        monthly_weights :: boolean; if True, the climatological mean is weighted by the number of days in each month
        time_var_name :: the string name of the time variable in dataset
        time_dim_name :: the string name of the time dimension variable in dataset
        fn_out :: string defining full output netcdf file path and name.
        missing_values :: boolean where True indicates the data has missing values 
            that should be ignored. Missing values must be represented by NaNs.
        '''

        frequency_str = time_var_name + '.' + output_frequency
        print('Calculating climatological mean')

        if missing_values:
            ds_mean = xr.Dataset()
            for varname, da in ds.data_vars.items():
                mask = xr.where(uf.isnan(da), 0, 1)
                data = da.groupby(frequency_str).sum(dim=time_dim_name)
                N = mask.groupby(frequency_str).sum(dim=time_dim_name)
                ds_mean[varname] = data / N
        else:
            if monthly_weights:
                month_length = ds[time_var_name].dt.days_in_month
                grouped = month_length.groupby(frequency_str)
            else:
                ds['clim_mean_ones_tmp'] = (time_dim_name,
                                            np.ones(
                                                ds[time_var_name].shape[0]))
                grouped = ds['clim_mean_ones_tmp'].groupby(frequency_str)

            weights = grouped / grouped.sum()
            ds_mean = (ds *
                       weights).groupby(frequency_str).sum(dim=time_dim_name)

            if not monthly_weights:
                ds = ds.drop_vars('clim_mean_ones_tmp')

        if fn_out is not None:
            print('Saving to file. May take some time..')
            with ProgressBar():
                ds_mean.to_netcdf(fn_out)

        return ds_mean

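A minimal, hypothetical call of make_climatology, assuming a dataset laid out as in the docstring (time variable 'time', time dimension 't_dim'); the path is a placeholder and the missing_values branch relies on the module's uf alias for xarray.ufuncs:

import xarray as xr

ds = xr.open_dataset('/path/to/input.nc', chunks={'t_dim': 100})
monthly_clim = make_climatology(ds, 'month', missing_values=True)  # lazy dask result
monthly_clim = monthly_clim.compute()  # load into memory if it fits
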
Example 6
 def phenology_optimized(cube):
     import xarray
     from xarray.ufuncs import fabs,isnan
     import numpy
 
     class Phenology:
         
         """
             sStartDate: First date of the interval for getting season start
             sEndDate: Last date of the interval for getting season start
         
             mStartDate: First date of the interval for getting maximum greenness
             mEndDate: Last date of the interval for getting maximum greenness
         
             eStartDate: First date of the interval for getting season end
             eEndDate: Last date of the interval for getting season end
         
             tSos: The offset (%) to add to the start date minimum to set the start of the season
             tEos: The offset (%) to subtract from the end date minimum to set the end of the season
         """
         def __init__(self,year,tdim,taxis):
             self.year=  year                                 # year of the season, int
             self.tdim = tdim                                 # the name of the time dimension (string)
             self.taxis = taxis                               # the index of the time dimension (int)
             self.sStart=numpy.datetime64(str(year)+'-04-02') # Start date of interval for start of season
             self.sEnd=  numpy.datetime64(str(year)+'-06-10') # End date of interval for start of season
             self.mStart=numpy.datetime64(str(year)+'-06-10') # Start date of interval for mid of season
             self.mEnd=  numpy.datetime64(str(year)+'-09-01') # End date of interval for mid of season
             self.eStart=numpy.datetime64(str(year)+'-09-01') # Start date of interval for end of season
             self.eEnd=  numpy.datetime64(str(year)+'-12-31') # End date of interval for end of season
             self.tSos=  10.                                  # Threshold for start of season
             self.tEos=  10.                                  # Threshold for end of season
 
 
         """
             Calculate the maximum greenness in the mid-season as reference
         """
         def getLocalMax(self,array):
             # Get the local maximum greenness
             seasonMid_Range=array.sel(t=slice(self.mStart,self.mEnd))
             seasonMid_MaxGreennessIdx=seasonMid_Range.argmax('t')
             #seasonMid_DateAtMax=seasonMid_Range.t[seasonMid_MaxGreennessIdx].dt.dayofyear
             seasonMid_MaxGreenness=seasonMid_Range.isel(t=seasonMid_MaxGreennessIdx)
             return seasonMid_MaxGreenness
     
     
         """
             Calculate the start of the season based on selected interval [start, end] and a greenness curve (df). 
             Within this interval we will first look for the local minimum greenness, marked by (dsMin, ysMin). In the
             second step we will use the offset (%) to calculate the amount of greenness offset that needs to be applied to
             the minimum value in order to get the start of the season. This offset is calculated as a percentage of the
             difference between the maximum greenness and the local minimum.
         """
         def getStartOfSeason(self,array, sMmaxgreen):
             # compute the minimum in the season start
             seasonStart_Range=array.sel(t=slice(self.sStart,self.sEnd))
             seasonStart_MinGreennessIdx=seasonStart_Range.argmin('t')
             seasonStart_DateAtMin,seasonStart_MinGreenness=seasonStart_Range.t[seasonStart_MinGreennessIdx].dt.dayofyear,seasonStart_Range.isel(t=seasonStart_MinGreennessIdx)
             # Calculate the greenness value corresponding to the start of the season
             seasonStart_Greenness = seasonStart_MinGreenness + ((sMmaxgreen - seasonStart_MinGreenness) * (self.tSos / 100.0))
             # Get the closest date to this greenness
             #for i in range(len(seasonStart_Range[:])): seasonStart_Range[i]=seasonStart_Range[i]-seasonStart_Greenness
             seasonStart_Range=fabs(seasonStart_Range-seasonStart_Greenness)
             seasonStart_Idx=seasonStart_Range.where(seasonStart_Range.t.dt.dayofyear>=seasonStart_DateAtMin).argmin('t',skipna=True)
             seasonStart_Date=seasonStart_Range.t[seasonStart_Idx].dt.dayofyear
             return seasonStart_Date
     
     
         """
             Calculate the end of the season based on selected interval [start, end] and a greenness curve (df). 
             Within this interval we will first look for the local minimum greenness, marked by (deMin, yeMin). In the
             second step we will use the offset (%) to calculate the amount of greenness offset that needs to be applied to
             the minimum value in order to get the end of the season. This offset is calculated as a percentage of the
             difference between the maximum greenness and the local minimum.
         """
         def getEndOfSeason(self, array, sMmaxgreen):
             # compute the minimum in the season end
             seasonEnd_Range=array.sel(t=slice(self.eStart,self.eEnd))
             seasonEnd_MinGreennessIdx=seasonEnd_Range.argmin('t')
             seasonEnd_DateAtMin,seasonEnd_MinGreenness=seasonEnd_Range.t[seasonEnd_MinGreennessIdx].dt.dayofyear,seasonEnd_Range.isel(t=seasonEnd_MinGreennessIdx)
             # Calculate the greenness value corresponding to the end of the season
             seasonEnd_Greenness = seasonEnd_MinGreenness + ((sMmaxgreen - seasonEnd_MinGreenness) * (self.tEos / 100.0))
             # Get the closest date to this greenness
             #for i in range(len(seasonEnd_Range[:])): seasonEnd_Range[i]=seasonEnd_Range[i]-seasonEnd_Greenness
             seasonEnd_Range=fabs(seasonEnd_Range-seasonEnd_Greenness)
             seasonEnd_Idx=seasonEnd_Range.where(seasonEnd_Range.t.dt.dayofyear<=seasonEnd_DateAtMin).argmin('t',skipna=True)
             seasonEnd_Date=seasonEnd_Range.t[seasonEnd_Idx].dt.dayofyear
             return seasonEnd_Date
 
 
     # get the xarray, selecting band zero if multiple bands present    
     # also building bands and t removed metadata
     array=cube.get_array()
     origdims=list(array.dims)
     array=array.isel(bands=range(0,1)).squeeze('bands',drop=True)
     dims=list(array.dims)
     dims.remove('t')
     coords=dict(array.coords)
     coords.pop('t')
     
     # guard against missing data (NaNs)
     # input data should already be smoothed and interpolated
     missingmask=isnan(array).any('t')
     array=array.fillna(0.)
 
     # run phenology bundle
     year=int((array.t.min()+(array.t.values.max()-array.t.min())/2).dt.year)#int(array.t.dt.year[0])
     pp=Phenology(year,'t',array.dims.index('t')) 
     seasonMid_MaxGreenness=pp.getLocalMax(array)
     seasonStart_Date=pp.getStartOfSeason(array, seasonMid_MaxGreenness)
     seasonEnd_Date=pp.getEndOfSeason(array, seasonMid_MaxGreenness)
 
     # combine results
     seasonStart_Date=xarray.DataArray(seasonStart_Date,dims=dims,coords=coords)
     seasonEnd_Date=xarray.DataArray(seasonEnd_Date,dims=dims,coords=coords)
     season=xarray\
         .concat([seasonStart_Date,seasonEnd_Date],dim='bands')\
         .expand_dims('t',0)\
         .assign_coords(bands=['sos','eos'],t=[numpy.datetime64(str(pp.year)+"-01-01")])\
         .astype(numpy.int16)
     
     # set missing data to 0, exploiting that at this point t,bands are the first two coordinates
     season=season.where(~missingmask,numpy.int16(0))
 
     # set the original order of dimensions
     season=season.transpose(*origdims)
     
     return DataCube(season)
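
In isolation, the thresholding described in the docstrings above: the start of season is placed at the date, at or after the local minimum, whose greenness is closest to the level min + (max - min) * tSos/100 (and analogously for the end of season with tEos). A toy illustration with made-up numbers:

greenness_min = 0.2   # local minimum in the search window
greenness_max = 0.8   # mid-season maximum used as reference
t_sos = 10.0          # threshold offset in percent

# greenness level that marks the start of the season
sos_level = greenness_min + (greenness_max - greenness_min) * (t_sos / 100.0)
print(sos_level)      # 0.26
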
Example 7
import xarray as xr
import xarray.ufuncs as xu
from interpolation import gapfill_interpolation, remove_ocean_points
from feature_engineering import (create_precip_binary, logscale_precip, create_lat_lon_features,
                                 create_time_feature, creade_embedded_features, stack_constant_maps,
                                 normalise, stack)
from clusterings import kmeans_clustering
from regression_learning import Imputation
from sklearn.ensemble import RandomForestRegressor
from postproc import exp_precip, renormalise, unstack

# load data
data = xr.open_dataset('/path/to/gappy/dataset')
constant_maps = xr.open_dataset('/path/to/constant/maps')

# create mask of missing values
mask = xu.isnan(data) 

# get list of variables
variables = data.coords['variables'].values

# step 1: interpolation
data = gapfill_interpolation(data)

# optional: save interpolation result for comparison

# optional: remove ocean points for reducing file size
landmask = xr.open_dataset('/path/to/landmask') # needs dims 'latitude' and 'longitude'
data = remove_ocean_points(data, landmask)
mask = remove_ocean_points(mask, landmask)

# step 2: feature engineering
Example 8
def zero_missing_data(data1, data2):
    """Replace NaN values with zeros in data1 if the data is valid in data2."""
    nans = xu.logical_and(xu.isnan(data1), xu.logical_not(xu.isnan(data2)))
    return data1.where(~nans, 0)
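
A quick sketch of the behaviour on two toy DataArrays (xu is assumed to be xarray.ufuncs, as in the snippet):

import numpy as np
import xarray as xr
import xarray.ufuncs as xu

radar = xr.DataArray([np.nan, 2.0, np.nan], dims='x')  # field with gaps
gauge = xr.DataArray([1.0, 1.5, np.nan], dims='x')     # reference field

filled = zero_missing_data(radar, gauge)
# -> [0.0, 2.0, nan]: NaNs are zeroed only where the reference field is valid
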
Example 9
    landmask: boolean xarray dataarray with only the coordinates latitude and
        longitude, where grid points on land are True and grid points in the
        ocean (or regions that are not relevant for research) are False

    Returns
    ----------
    imputed_data: data of the same shape as the input data, where all values
        that were not missing are unchanged and all values that were originally
        missing are imputed via the spatiotemporal mean
    """
    landlat, landlon = np.where(landmask)
    return data.isel(longitude=xr.DataArray(landlon, dims='landpoints'),
                     latitude=xr.DataArray(landlat, dims='landpoints'))


if __name__ == '__main__':

    data = xr.open_dataset('/path/to/gappy/dataset')
    log_fracmis(data, 'after reading file')

    mask = xu.isnan(data)  # create mask of missing values

    data = gapfill_interpolation(
        data)  # initial gapfill all missing values with interpolation
    log_fracmis(data, 'after interpolation')  # should be zero

    # optional: remove ocean points for reducing file size
    landmask = xr.open_dataset(
        '/path/to/landmask')  # needs dims 'latitude' and 'longitude'
    data = remove_ocean_points(data, landmask)
    mask = remove_ocean_points(mask, landmask)

    # data.to_netcdf ...
    # mask.to_netcdf ...
Example 10
for pp in range(0, n_port):
    tg_tmp = tg.isel(port=pp)

    a0, g0, af, gf = do_analysis(tg_tmp.time, tg_tmp.ssh, const,
                                 tg_tmp.latitude.values)
    a_10y[pp, 0] = a0
    g_10y[pp, 0] = g0
    a_10y[pp, 1] = af
    g_10y[pp, 1] = gf

    for dd in range(0, n_1m - 1):

        tg_m = tg_tmp.sel(time=slice(dates_1m[dd], dates_1m[dd + 1]))

        if len(np.where(uf.isnan(tg_m.ssh))[0]) > tg_m.dims['time'] / 5:
            continue
        a0, g0, af, gf = do_analysis(tg_m.time, tg_m.ssh, const,
                                     tg_m.latitude.values)
        a_1m[pp, 0, dd] = a0
        g_1m[pp, 0, dd] = g0
        a_1m[pp, 1, dd] = af
        g_1m[pp, 1, dd] = gf

    for dd in range(0, n_3m - 1):

        tg_m = tg_tmp.sel(time=slice(dates_3m[dd], dates_3m[dd + 1]))

        if len(np.where(uf.isnan(tg_m.ssh))[0]) > tg_m.dims['time'] / 5:
            continue
        a0, g0, af, gf = do_analysis(tg_m.time, tg_m.ssh, const,