def mask_source_lonlats(source_def, mask):
    """Mask source longitudes and latitudes to match data mask."""
    source_geo_def = source_def

    # the data may have additional masked pixels
    # let's compare them to see if we can use the same area
    # assume lons and lats mask are the same
    if mask is not None and mask is not False and isinstance(source_geo_def, SwathDefinition):
        if np.issubsctype(mask.dtype, np.bool):
            # copy the source area and use it for the rest of the calculations
            LOG.debug("Copying source area to mask invalid dataset points")
            if mask.ndim != source_geo_def.lons.ndim:
                raise ValueError("Can't mask area, mask has different number "
                                 "of dimensions.")
            return SwathDefinition(source_geo_def.lons.where(~mask),
                                   source_geo_def.lats.where(~mask))
        else:
            return SwathDefinition(source_geo_def.lons.where(~xu.isnan(mask)),
                                   source_geo_def.lats.where(~xu.isnan(mask)))
    return source_geo_def
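# A minimal, hypothetical usage sketch for mask_source_lonlats above (not from the
# source). It defines the module-level names the function relies on (np, xu, LOG,
# SwathDefinition) and assumes older NumPy/xarray versions in which np.bool,
# np.issubsctype and xarray.ufuncs are still available.
import logging

import numpy as np
import xarray as xr
import xarray.ufuncs as xu
from pyresample.geometry import SwathDefinition

LOG = logging.getLogger(__name__)

lons = xr.DataArray(np.array([[10.0, 10.1], [10.2, 10.3]]), dims=('y', 'x'))
lats = xr.DataArray(np.array([[50.0, 50.0], [50.1, 50.1]]), dims=('y', 'x'))
swath = SwathDefinition(lons, lats)

# boolean mask of invalid data pixels, same shape as the swath
bad = xr.DataArray(np.array([[False, True], [False, False]]), dims=('y', 'x'))

masked_swath = mask_source_lonlats(swath, bad)  # lon/lat become NaN where bad is True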
import sys

import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import xarray.ufuncs as xu

plt.rcParams.update({'font.size': 14})

variable = sys.argv[1]
section = sys.argv[2]

surf_path = '/DataArchive/C3S/subsurf_temp'
ds = xr.open_dataset(surf_path + '/Results/' + variable + '_' + section +
                     '_ORCA-0.25x0.25_regular_1979_2018.nc')
var = ds[variable].mean(dim='time').squeeze().rename(variable + r' $[^oC]$')

colors = 'YlOrRd'
if section == 'A3':
    xname = 'longitude'
else:
    xname = 'latitude'

# boolean mask of missing (NaN) points, contoured below to outline the missing-data region
mask = xu.isnan(var)

fig = plt.figure(1, figsize=(15, 8))

ax = fig.add_subplot(211)
p = var.sel(depth=slice(0, 1000)).plot.contourf(
    ax=ax, cmap=colors, vmin=np.nanmin(var), vmax=np.nanmax(var),
    extend='both', levels=31,
    cbar_kwargs={'drawedges': True, 'shrink': 1.})
mask.sel(depth=slice(0, 1000)).plot.contour(ax=ax, levels=1, colors='k')
ax.set_xlabel(xname)
ax.invert_yaxis()

ax2 = fig.add_subplot(212)
p2 = var.plot.contourf(
    ax=ax2, cmap=colors, vmin=np.nanmin(var), vmax=np.nanmax(var),
    extend='both', levels=31,
    cbar_kwargs={'drawedges': True, 'shrink': 1.})
mask.plot.contour(ax=ax2, levels=1, colors='k')
ax2.set_xlabel(xname)
ax2.invert_yaxis()

fig.tight_layout()
fig.savefig(surf_path + '/Figures/' + variable + '_' + section +
            'mean_ORCA-0.25x0.25_regular_1979_2018.png',
            transparent=True, dpi=300)
if inormalise:
    # mean zero, unit (1) standard deviation
    logging.info('normalise data ...')
    datamean = dataxr_filled.mean(dim=('time', 'landpoints'))
    datastd = dataxr_filled.std(dim=('time', 'landpoints'))
    dataxr_lost = (dataxr_lost - datamean) / datastd
    dataxr = (dataxr - datamean) / datastd
    dataxr_filled = (dataxr_filled - datamean) / datastd

    invarmean = timeinvariant.mean(dim=('time', 'landpoints'))
    invarstd = timeinvariant.std(dim=('time', 'landpoints'))
    timeinvariant = (timeinvariant - invarmean) / invarstd

timebeforefill = datetime.now()

# obtain lostmask
lostmask = xu.isnan(dataxr_lost)  # missing (lost + orig missing + ocean) is True, rest is False

# impute data
logging.info(f'impute data with method {fill_method} ...')
if fill_method in ['mean_impute', 'locf']:
    pass
elif fill_method in [
        'missforestmean', 'missforestlocf', 'ridgemean', 'ridgelocf',
        'svdimputemean', 'svdimputelocf', 'gp', 'anchorRegression'
]:
    # stack var and invar
    logging.info(f'{fill_method}: stack ...')
    ntimesteps = dataxr_filled.coords['time'].size
    timeinvariant = np.repeat(timeinvariant, ntimesteps, axis=1)
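# The zero-mean / unit-standard-deviation scaling used above, shown in isolation as a
# tiny self-contained sketch (toy DataArray; the dimension names follow the snippet).
import numpy as np
import xarray as xr

da = xr.DataArray(np.random.rand(10, 5), dims=('time', 'landpoints'))
da_norm = (da - da.mean(dim=('time', 'landpoints'))) / da.std(dim=('time', 'landpoints'))
print(float(da_norm.mean()), float(da_norm.std()))  # approximately 0.0 and 1.0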
import numpy as np
import xarray as xr
import xarray.ufuncs as uf
from dask.diagnostics import ProgressBar


def make_climatology(ds, output_frequency, monthly_weights=False,
                     time_var_name='time', time_dim_name='t_dim',
                     fn_out=None, missing_values=False):
    '''
    Calculates a climatology for all variables in a supplied dataset.

    The resulting xarray dataset will NOT be loaded to RAM. Instead, it is a
    set of dask operations. To load to RAM use, e.g. .compute(). However, if
    the original data was large, this may take a long time and a lot of
    memory. Make sure you have the available RAM or that chunking and parallel
    processes are specified correctly. Otherwise, it is recommended that you
    access the climatology data in an indexed way, i.e. compute only specific
    parts of the data at once.

    The resulting climatology dataset can be written to disk using
    .to_netcdf(). Again, this may take a while for larger datasets.

    ds :: xarray dataset object from a COAsT object.
    output_frequency :: any xarray groupby string. i.e: 'month' 'season'
    time_var_name :: the string name of the time variable in dataset
    time_dim_name :: the string name of the time dimension variable in dataset
    fn_out :: string defining full output netcdf file path and name.
    missing_values :: boolean where True indicates the data has missing values
        that should be ignored. Missing values must be represented by NaNs.
    '''
    frequency_str = time_var_name + '.' + output_frequency
    print('Calculating climatological mean')

    if missing_values:
        ds_mean = xr.Dataset()
        for varname, da in ds.data_vars.items():
            mask = xr.where(uf.isnan(da), 0, 1)
            data = da.groupby(frequency_str).sum(dim=time_dim_name)
            N = mask.groupby(frequency_str).sum(dim=time_dim_name)
            ds_mean[varname] = data / N
    else:
        if monthly_weights:
            month_length = ds[time_var_name].dt.days_in_month
            grouped = month_length.groupby(frequency_str)
        else:
            ds['clim_mean_ones_tmp'] = (time_dim_name,
                                        np.ones(ds[time_var_name].shape[0]))
            grouped = ds['clim_mean_ones_tmp'].groupby(frequency_str)

        weights = grouped / grouped.sum()
        ds_mean = (ds * weights).groupby(frequency_str).sum(dim=time_dim_name)

        if not monthly_weights:
            ds = ds.drop_vars('clim_mean_ones_tmp')

    if fn_out is not None:
        print('Saving to file. May take some time..')
        with ProgressBar():
            ds_mean.to_netcdf(fn_out)

    return ds_mean
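# A minimal usage sketch for make_climatology with a toy dataset (illustrative only;
# the variable name and gap pattern are invented). The missing_values=True branch
# relies on the module-level imports shown above (xarray as xr, xarray.ufuncs as uf).
import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', '2001-12-31', freq='D')
sst = np.random.rand(time.size, 4, 4)
sst[::50, 0, 0] = np.nan  # a few artificial gaps
ds_toy = xr.Dataset({'sst': (('t_dim', 'y', 'x'), sst)},
                    coords={'time': ('t_dim', time)})

clim = make_climatology(ds_toy, 'month', missing_values=True)  # monthly means, NaNs ignored
print(clim.sst.sizes)  # 12 'month' entries plus the spatial dims 'y' and 'x'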
def phenology_optimized(cube):
    import xarray
    from xarray.ufuncs import fabs, isnan
    import numpy

    class Phenology:
        """
        sStartDate: First date of the interval for getting season start
        sEndDate: Last date of the interval for getting season start
        mStartDate: First date of the interval for getting maximum greenness
        mEndDate: Last date of the interval for getting maximum greenness
        eStartDate: First date of the interval for getting season end
        eEndDate: Last date of the interval for getting season end
        tSos: The offset (%) to add to the start date minimum to set the start of the season
        tEos: The offset (%) to subtract from the end date minimum to set the end of the season
        """

        def __init__(self, year, tdim, taxis):
            self.year = year    # year of the season, int
            self.tdim = tdim    # the name of the time dimension (string)
            self.taxis = taxis  # the index of the time dimension (int)
            self.sStart = numpy.datetime64(str(year) + '-04-02')  # Start date of interval for start of season
            self.sEnd = numpy.datetime64(str(year) + '-06-10')    # End date of interval for start of season
            self.mStart = numpy.datetime64(str(year) + '-06-10')  # Start date of interval for mid of season
            self.mEnd = numpy.datetime64(str(year) + '-09-01')    # End date of interval for mid of season
            self.eStart = numpy.datetime64(str(year) + '-09-01')  # Start date of interval for end of season
            self.eEnd = numpy.datetime64(str(year) + '-12-31')    # End date of interval for end of season
            self.tSos = 10.  # Threshold for start of season
            self.tEos = 10.  # Threshold for end of season

        def getLocalMax(self, array):
            """Calculate the maximum greenness in the mid-season as reference."""
            # Get the local maximum greenness
            seasonMid_Range = array.sel(t=slice(self.mStart, self.mEnd))
            seasonMid_MaxGreennessIdx = seasonMid_Range.argmax('t')
            # seasonMid_DateAtMax = seasonMid_Range.t[seasonMid_MaxGreennessIdx].dt.dayofyear
            seasonMid_MaxGreenness = seasonMid_Range.isel(t=seasonMid_MaxGreennessIdx)
            return seasonMid_MaxGreenness

        def getStartOfSeason(self, array, sMmaxgreen):
            """
            Calculate the start of the season based on the selected interval
            [start, end] and a greenness curve (df). Within this interval we
            first look for the local minimum greenness, marked by (dsMin, ysMin).
            In the second step we use the offset (%) to calculate the amount of
            greenness offset that needs to be applied to the minimum value in
            order to get the start of the season. This offset is calculated as a
            percentage of the difference between the maximum greenness and the
            local minimum.
            """
            # compute the minimum in the season start interval
            seasonStart_Range = array.sel(t=slice(self.sStart, self.sEnd))
            seasonStart_MinGreennessIdx = seasonStart_Range.argmin('t')
            seasonStart_DateAtMin = seasonStart_Range.t[seasonStart_MinGreennessIdx].dt.dayofyear
            seasonStart_MinGreenness = seasonStart_Range.isel(t=seasonStart_MinGreennessIdx)
            # Calculate the greenness value corresponding to the start of the season
            seasonStart_Greenness = seasonStart_MinGreenness + (
                (sMmaxgreen - seasonStart_MinGreenness) * (self.tSos / 100.0))
            # Get the closest date to this greenness
            # for i in range(len(seasonStart_Range[:])): seasonStart_Range[i] = seasonStart_Range[i] - seasonStart_Greenness
            seasonStart_Range = fabs(seasonStart_Range - seasonStart_Greenness)
            seasonStart_Idx = seasonStart_Range.where(
                seasonStart_Range.t.dt.dayofyear >= seasonStart_DateAtMin).argmin('t', skipna=True)
            seasonStart_Date = seasonStart_Range.t[seasonStart_Idx].dt.dayofyear
            return seasonStart_Date

        def getEndOfSeason(self, array, sMmaxgreen):
            """
            Calculate the end of the season based on the selected interval
            [start, end] and a greenness curve (df). Within this interval we
            first look for the local minimum greenness, marked by (deMin, yeMin).
            In the second step we use the offset (%) to calculate the amount of
            greenness offset that needs to be applied to the minimum value in
            order to get the end of the season. This offset is calculated as a
            percentage of the difference between the maximum greenness and the
            local minimum.
            """
            # compute the minimum in the season end interval
            seasonEnd_Range = array.sel(t=slice(self.eStart, self.eEnd))
            seasonEnd_MinGreennessIdx = seasonEnd_Range.argmin('t')
            seasonEnd_DateAtMin = seasonEnd_Range.t[seasonEnd_MinGreennessIdx].dt.dayofyear
            seasonEnd_MinGreenness = seasonEnd_Range.isel(t=seasonEnd_MinGreennessIdx)
            # Calculate the greenness value corresponding to the end of the season
            seasonEnd_Greenness = seasonEnd_MinGreenness + (
                (sMmaxgreen - seasonEnd_MinGreenness) * (self.tEos / 100.0))
            # Get the closest date to this greenness
            # for i in range(len(seasonEnd_Range[:])): seasonEnd_Range[i] = seasonEnd_Range[i] - seasonEnd_Greenness
            seasonEnd_Range = fabs(seasonEnd_Range - seasonEnd_Greenness)
            seasonEnd_Idx = seasonEnd_Range.where(
                seasonEnd_Range.t.dt.dayofyear <= seasonEnd_DateAtMin).argmin('t', skipna=True)
            seasonEnd_Date = seasonEnd_Range.t[seasonEnd_Idx].dt.dayofyear
            return seasonEnd_Date

    # get the xarray, selecting band zero if multiple bands are present
    # also build dims/coords metadata with 'bands' and 't' removed
    array = cube.get_array()
    origdims = list(array.dims)
    array = array.isel(bands=range(0, 1)).squeeze('bands', drop=True)
    dims = list(array.dims)
    dims.remove('t')
    coords = dict(array.coords)
    coords.pop('t')

    # guard against missing data (NaNs)
    # input data should already be smoothed and interpolated
    missingmask = isnan(array).any('t')
    array = array.fillna(0.)

    # run phenology bundle
    year = int((array.t.min() + (array.t.values.max() - array.t.min()) / 2).dt.year)  # int(array.t.dt.year[0])
    pp = Phenology(year, 't', array.dims.index('t'))
    seasonMid_MaxGreenness = pp.getLocalMax(array)
    seasonStart_Date = pp.getStartOfSeason(array, seasonMid_MaxGreenness)
    seasonEnd_Date = pp.getEndOfSeason(array, seasonMid_MaxGreenness)

    # combine results
    seasonStart_Date = xarray.DataArray(seasonStart_Date, dims=dims, coords=coords)
    seasonEnd_Date = xarray.DataArray(seasonEnd_Date, dims=dims, coords=coords)
    season = xarray\
        .concat([seasonStart_Date, seasonEnd_Date], dim='bands')\
        .expand_dims('t', 0)\
        .assign_coords(bands=['sos', 'eos'], t=[numpy.datetime64(str(pp.year) + "-01-01")])\
        .astype(numpy.int16)

    # set missing data to 0, exploiting that at this point t, bands are the first two coordinates
    season = season.where(~missingmask, numpy.int16(0))
    # restore the original order of dimensions
    season = season.transpose(*origdims)
    return DataCube(season)
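# Tiny numeric illustration (toy numbers, not from the source) of the tSos/tEos
# threshold rule described in the docstrings above:
#   target greenness = local minimum + (mid-season maximum - local minimum) * offset / 100
season_min = 0.2   # local minimum greenness in the start-of-season window
season_max = 0.8   # mid-season maximum greenness
tSos = 10.0        # offset in percent
sos_greenness = season_min + (season_max - season_min) * (tSos / 100.0)
print(sos_greenness)  # 0.26; the start of season is the date at or after the minimum
                      # whose greenness is closest to this target value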
import xarray as xr
import xarray.ufuncs as xu

from interpolation import gapfill_interpolation, remove_ocean_points
from feature_engineering import (create_precip_binary, logscale_precip,
                                 create_lat_lon_features, create_time_feature,
                                 creade_embedded_features, stack_constant_maps,
                                 normalise, stack)
from clusterings import kmeans_clustering
from regression_learning import Imputation
from sklearn.ensemble import RandomForestRegressor
from postproc import exp_precip, renormalise, unstack

# load data
data = xr.open_dataset('/path/to/gappy/dataset')
constant_maps = xr.open_dataset('/path/to/constant/maps')

# create mask of missing values
mask = xu.isnan(data)

# get list of variables
variables = data.coords['variables'].values

# step 1: interpolation
data = gapfill_interpolation(data)
# optional: save interpolation result for comparison

# optional: remove ocean points for reducing file size
landmask = xr.open_dataset('/path/to/landmask')  # needs dims 'latitude' and 'longitude'
data = remove_ocean_points(data, landmask)
mask = remove_ocean_points(mask, landmask)

# step 2: feature engineering
def zero_missing_data(data1, data2):
    """Replace NaN values with zeros in data1 if the data is valid in data2."""
    nans = xu.logical_and(xu.isnan(data1), xu.logical_not(xu.isnan(data2)))
    return data1.where(~nans, 0)
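# A small illustrative example for zero_missing_data above (toy arrays, not from the
# source); it assumes an xarray version that still provides xarray.ufuncs.
import numpy as np
import xarray as xr
import xarray.ufuncs as xu

a = xr.DataArray([1.0, np.nan, np.nan])
b = xr.DataArray([2.0, 3.0, np.nan])

print(zero_missing_data(a, b).values)
# [1. 0. nan]: the NaN at index 1 becomes 0 because b is valid there,
# while index 2 stays NaN because b is NaN as well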
    landmask: boolean xarray dataarray with only the coordinates latitude and
        longitude, where grid points on land are True and grid points in the
        ocean (or regions that are not relevant for research) are False

    Returns
    ----------
    imputed_data: data of the same shape as input data, where all values that
        were not missing are still the same and all values that were originally
        missing are imputed via spatiotemporal mean
    """
    landlat, landlon = np.where(landmask)
    return data.isel(longitude=xr.DataArray(landlon, dims='landpoints'),
                     latitude=xr.DataArray(landlat, dims='landpoints'))


if __name__ == '__main__':

    data = xr.open_dataset('/path/to/gappy/dataset')
    log_fracmis(data, 'after reading file')

    mask = xu.isnan(data)  # create mask of missing values
    data = gapfill_interpolation(data)  # initial gapfill all missing values with interpolation
    log_fracmis(data, 'after interpolation')  # should be zero

    # optional: remove ocean points for reducing file size
    landmask = xr.open_dataset('/path/to/landmask')  # needs dims 'latitude' and 'longitude'
    data = remove_ocean_points(data, landmask)
    mask = remove_ocean_points(mask, landmask)

    # data.to_netcdf ...
    # mask.to_netcdf ...
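# Toy illustration (not from the source) of the vectorized-indexing trick used in the
# function above: selecting only land points collapses (latitude, longitude) into a
# single 1-D 'landpoints' dimension.
import numpy as np
import xarray as xr

field = xr.DataArray(np.arange(12.0).reshape(3, 4), dims=('latitude', 'longitude'))
toy_landmask = xr.DataArray(np.array([[True, False, False, True],
                                      [False, True, False, False],
                                      [False, False, True, True]]),
                            dims=('latitude', 'longitude'))

landlat, landlon = np.where(toy_landmask)
land_only = field.isel(longitude=xr.DataArray(landlon, dims='landpoints'),
                       latitude=xr.DataArray(landlat, dims='landpoints'))
print(land_only.sizes)  # 5 land points along the new 'landpoints' dimension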
for pp in range(0, n_port):
    tg_tmp = tg.isel(port=pp)

    a0, g0, af, gf = do_analysis(tg_tmp.time, tg_tmp.ssh, const,
                                 tg_tmp.latitude.values)
    a_10y[pp, 0] = a0
    g_10y[pp, 0] = g0
    a_10y[pp, 1] = af
    g_10y[pp, 1] = gf

    for dd in range(0, n_1m - 1):
        tg_m = tg_tmp.sel(time=slice(dates_1m[dd], dates_1m[dd + 1]))
        # skip windows where more than a fifth of the SSH values are missing
        if len(np.where(uf.isnan(tg_m.ssh))[0]) > tg_m.dims['time'] / 5:
            continue
        a0, g0, af, gf = do_analysis(tg_m.time, tg_m.ssh, const,
                                     tg_m.latitude.values)
        a_1m[pp, 0, dd] = a0
        g_1m[pp, 0, dd] = g0
        a_1m[pp, 1, dd] = af
        g_1m[pp, 1, dd] = gf

    for dd in range(0, n_3m - 1):
        tg_m = tg_tmp.sel(time=slice(dates_3m[dd], dates_3m[dd + 1]))
        if len(np.where(uf.isnan(tg_m.ssh))[0]) > tg_m.dims['time'] / 5:
            continue
        a0, g0, af, gf = do_analysis(tg_m.time, tg_m.ssh, const,