def __init__(self, path="/RECH/skynet1_rech3/huziy/cru_data/CRUTS3.1/cru_ts_3_10.1901.2009.tmp.dat.nc", var_name="tmp", lazy=False): self.times = None self.var_data = None self.times_var = None self.kdtree = None self.times_num = None self.lons2d, self.lats2d = None, None self.lazy = lazy self.var_name = var_name try: with Dataset(path) as ds: self._init_fields(ds) # Cannot go into with, since it needs to be open self.nc_dataset = Dataset(path) except OSError as oserr: with MFDataset(path) as ds: self._init_fields(ds) # Cannot go into with, since it needs to be open self.nc_dataset = MFDataset(path) self.nc_vars = ds.variables
def ice_comp_model_to_osi(pathToModel, modelYear, modelIteration,
                          boundLat, pathToOSI, param='area', threshold=0.15):
    """Plot sea ice area (or extent) from satellite data and several model iterations."""
    fsat = MFDataset(pathToOSI + '/OSI' + modelYear + '??.nc')
    osi_lat = fsat.variables['lat'][:]
    osi_lon = fsat.variables['lon'][:]
    osi_ice = fsat.variables['ice_conc'][0, :, :]
    area_osi = np.ones(osi_ice.shape) * 100

    osi_area = []
    for mm in range(12):
        if param == 'area':
            osi_area.append(calc_area(fsat.variables['ice_conc'][mm, :, :] / 100,
                                      area_osi, osi_lat, blat=boundLat,
                                      threshold=threshold) / 1e6)
        elif param == 'extent':
            osi_area.append(calc_extent(fsat.variables['ice_conc'][mm, :, :] / 100,
                                        area_osi, osi_lat, blat=boundLat,
                                        threshold=threshold) / 1e6)

    g = Dataset('./grid.cdf')
    dxc = g.variables['dxc'][0, :, :]
    dyc = g.variables['dyc'][0, :, :]
    lat = g.variables['yc'][0, :, :]
    dxcXdyc = dxc * dyc

    area_model = np.zeros((len(modelIteration), 12))
    for it, iteration in enumerate(modelIteration):
        fm = MFDataset(pathToModel + '/' + modelYear + '/it' + str(iteration) + '/fw/*.cdf')
        for mm in range(12):
            if param == 'area':
                area_model[it, mm] = calc_area(fm.variables['area'][mm, :, :],
                                               dxcXdyc, lat, blat=boundLat,
                                               threshold=threshold) / 10e11
            elif param == 'extent':
                area_model[it, mm] = calc_extent(fm.variables['area'][mm, :, :],
                                                 dxcXdyc, lat, blat=boundLat,
                                                 threshold=threshold) / 10e11
        fm.close()

    dates = pd.date_range(modelYear + '-01', str(int(modelYear) + 1) + '-01', freq='M')
    dd = pd.DataFrame(index=dates)
    dd['Satellite'] = osi_area
    for it, iteration in enumerate(modelIteration):
        dd['it' + str(iteration)] = area_model[it, :]

    return dd.plot(figsize=(10, 5), lw=3)
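# A minimal usage sketch for ice_comp_model_to_osi, assuming the OSI SAF
# files, the model output tree and ./grid.cdf exist locally (all paths and
# iteration numbers below are hypothetical). The function returns the
# matplotlib Axes produced by DataFrame.plot.
if __name__ == '__main__':
    ax = ice_comp_model_to_osi(pathToModel='/data/model', modelYear='2007',
                               modelIteration=[1, 2, 3], boundLat=60.0,
                               pathToOSI='/data/osisaf', param='area',
                               threshold=0.15)
    ax.figure.savefig('ice_area_2007.png')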
def ice_comp_model_to_osi_table(pathToModel, modelYears, modelIteration,
                                boundLat, pathToOSI, param='area', threshold=0.15):
    diff_array = np.zeros((len(modelYears), 12))

    for nnum, yyear in enumerate(modelYears):
        fsat = MFDataset(pathToOSI + '/OSI' + yyear + '??.nc')
        osi_lat = fsat.variables['lat'][:]
        osi_lon = fsat.variables['lon'][:]
        osi_ice = fsat.variables['ice_conc'][0, :, :]
        area_osi = np.ones(osi_ice.shape) * 100

        osi_area = []
        for mm in range(12):
            if param == 'area':
                osi_area.append(calc_area(fsat.variables['ice_conc'][mm, :, :] / 100,
                                          area_osi, osi_lat, blat=boundLat,
                                          threshold=threshold) / 1e6)
            elif param == 'extent':
                osi_area.append(calc_extent(fsat.variables['ice_conc'][mm, :, :] / 100,
                                            area_osi, osi_lat, blat=boundLat,
                                            threshold=threshold) / 1e6)

        g = Dataset('./grid.cdf')
        dxc = g.variables['dxc'][0, :, :]
        dyc = g.variables['dyc'][0, :, :]
        lat = g.variables['yc'][0, :, :]
        dxcXdyc = dxc * dyc

        area_model = np.zeros((len(modelIteration), 12))

        if modelIteration[0] == 'last':
            gg = glob.glob(pathToModel + '/' + yyear + '/it*')
            gg.sort()
            lastit = [int(gg[-1].split('/')[-1].split('t')[-1])]
        else:
            lastit = modelIteration

        for it, iteration in enumerate(lastit):
            fm = MFDataset(pathToModel + '/' + yyear + '/it' + str(iteration) + '/fw/*.cdf')
            for mm in range(12):
                if param == 'area':
                    area_model[it, mm] = calc_area(fm.variables['area'][mm, :, :],
                                                   dxcXdyc, lat, blat=boundLat) / 10e11
                elif param == 'extent':
                    area_model[it, mm] = calc_extent(fm.variables['area'][mm, :, :],
                                                     dxcXdyc, lat, blat=boundLat) / 10e11
            fm.close()

        diff_array[nnum, :] = area_model[0, :] - osi_area[:]

    return diff_array
def get_climatologic_field(self, varname="mrro", gcm="", rcm="",
                           start_year=None, end_year=None, months=None):
    """
    for time t: start_year <= t <= end_year
    """
    mfds = MFDataset("{0}/{1}-{2}/current/{3}_*.nc".format(
        self.folder_with_nc_data, gcm, rcm, varname))

    self.lon2d = mfds.variables[self.lon_name][:].transpose()
    self.lat2d = mfds.variables[self.lat_name][:].transpose()
    self._init_kd_tree()

    cache_file = self._get_clim_cache_file_path(varname=varname, gcm=gcm, rcm=rcm,
                                                start_year=start_year,
                                                end_year=end_year, months=months)
    cache_file = os.path.join(self.cache_files_folder, cache_file)
    if os.path.isfile(cache_file):
        mfds.close()
        # pickle files must be opened in binary mode
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    t = mfds.variables["time"]
    t_units = t.units
    t_calendar = t.calendar

    t_start = date2num(datetime(start_year, 1, 1), t_units, calendar=t_calendar)
    t_end = date2num(datetime(end_year + 1, 1, 1), t_units, calendar=t_calendar)

    t = t[:]
    t_sel = t[(t_start <= t) & (t < t_end)]
    dates_sel = num2date(t_sel, t_units, calendar=t_calendar)

    # np.bool is deprecated; the built-in bool works as a dtype here
    bool_vect = np.array([x.month in months for x in dates_sel], dtype=bool)

    data_sel = mfds.variables[varname][np.where((t_start <= t) & (t < t_end))[0], :, :]

    # transpose because the axes in the file are inverted
    result = data_sel[bool_vect, :, :].mean(axis=0).transpose()

    # save the result to a cache file for reuse
    with open(cache_file, "wb") as f:
        pickle.dump(result, f)

    mfds.close()
    return result
def getCruData(month, dimLon, dimLat, cruPath):
    from netCDF4 import MFDataset
    import numpy as np

    # dimLon = np.array((-30.25, 50.25))
    # dimLat = np.array((30.25, 70.25))

    # account for Python's 0-based indexing
    month = int(month) - 1

    # open netCDF file
    nc = MFDataset(cruPath)

    # map lat/lon data to the CRU netCDF indexing scheme
    lonStart = np.where(nc.variables['lon'][:] == snapToGrid(dimLon[0]))[0][0] + 1
    lonEnd = np.where(nc.variables['lon'][:] == snapToGrid(dimLon[1]))[0][0] - 1
    lonDim = lonEnd - lonStart
    latStart = np.where(nc.variables['lat'][:] == snapToGrid(dimLat[0]))[0][0] + 1
    latEnd = np.where(nc.variables['lat'][:] == snapToGrid(dimLat[1]))[0][0] - 1
    latDim = latEnd - latStart

    # number of available years for every month; use integer division,
    # since '/' would yield a float under Python 3
    yearsAvailable = len(nc.dimensions['time']) // 12

    # preallocate the month array, e.g. shape (100, 81, 161)
    monthData = np.empty((yearsAvailable, latDim, lonDim))
    monthData[:] = np.nan

    for i in range(0, yearsAvailable):
        # read data from file for the given month
        monthData[i, :, :] = nc.variables['tmp'][month, latStart:latEnd, lonStart:lonEnd]
        month = month + 12

    # transpose to stay consistent with the old array schema
    monthData = monthData.T

    # get vectors of lat/lon data
    longitude = nc.variables['lon'][lonStart:lonEnd]
    latitude = nc.variables['lat'][latStart:latEnd]
    nc.close()

    # set NA values
    monthData[monthData > 100] = np.nan

    return monthData, longitude, latitude
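# A call sketch for getCruData, assuming snapToGrid is in scope and the CRU
# file is available locally. The bounds are taken from the commented-out
# defaults above; the path is hypothetical.
if __name__ == '__main__':
    import numpy as np

    tmp, lon, lat = getCruData(month=7,
                               dimLon=np.array((-30.25, 50.25)),
                               dimLat=np.array((30.25, 70.25)),
                               cruPath='cru_ts_3_10.1901.2009.tmp.dat.nc')
    print(tmp.shape, lon.shape, lat.shape)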
class EcoFOCI_mfnetCDF(object):

    def __init__(self, file_name=None, aggdim=None):
        """Initialize opening of multiple netcdf files along the same
        dimension (aggdim) in the same path.

        Parameters
        ----------
        file_name : str
            full path to file on disk (with wildcards)
        aggdim : str
            dimension name to aggregate along. The slowest varying
            dimension or the unlimited dimension will be chosen if no
            option is passed.
        """
        # MFDataset is read-only and takes no mode argument; passing 'a'
        # positionally would be misread as the `check` flag
        self.nchandle = MFDataset(file_name, aggdim=aggdim)
        self.file_name = file_name

    def get_global_atts(self):
        g_atts = {}
        att_names = self.nchandle.ncattrs()
        for name in att_names:
            g_atts[name] = self.nchandle.getncattr(name)
        return g_atts

    def get_vars(self):
        self.variables = self.nchandle.variables
        return self.nchandle.variables

    def ncreadfile_dic(self):
        data = {}
        for j, v in enumerate(self.nchandle.variables):
            if v in self.nchandle.variables.keys():  # check for nc variable
                data[v] = self.nchandle.variables[v][:]
            else:  # if the parameter doesn't exist, leave it empty
                data[v] = None
        return data

    def close(self):
        self.nchandle.close()
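# A usage sketch for EcoFOCI_mfnetCDF, assuming the files share an
# unlimited record dimension named 'time' (the glob path is hypothetical).
if __name__ == '__main__':
    mf = EcoFOCI_mfnetCDF(file_name='/data/moorings/*.nc', aggdim='time')
    print(mf.get_global_atts())
    data = mf.ncreadfile_dic()  # {varname: numpy array}
    mf.close()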
def runTest(self):
    """testing multi-file dataset access"""
    f = MFDataset(self.files, check=True)
    assert f.history == 'created today'
    assert_array_equal(np.arange(0, nx), f.variables['x'][:])
    varin = f.variables['data']
    datin = varin[:]
    assert_array_equal(datin.mask, data.mask)
    varin.set_auto_maskandscale(False)
    data2 = data.filled()
    assert varin.long_name == 'phony data'
    assert len(varin) == nx
    assert varin.shape == (nx, ydim, zdim)
    assert varin.dimensions == ('x', 'y', 'z')
    assert_array_equal(varin[4:-4:4, 3:5, 2:8], data2[4:-4:4, 3:5, 2:8])
    assert varin[0, 0, 0] == data2[0, 0, 0]
    assert_array_equal(varin[:], data2)
    assert getattr(varin, 'nonexistantatt', None) is None
    f.close()
def get_timerange(resource):
    """
    returns from/to timestamps of the given netCDF file(s).

    :param resource: list of path(s) to netCDF file(s)
    :returns: start, end as 'YYYYMMDD' strings
    """
    start = end = None
    if not isinstance(resource, list):
        resource = [resource]
    logger.debug(resource)
    try:
        if len(resource) > 1:
            ds = MFDataset(resource)
        else:
            ds = Dataset(resource[0])
        time = ds.variables["time"]

        if hasattr(time, "units") and hasattr(time, "calendar"):
            s = num2date(time[0], time.units, time.calendar)
            e = num2date(time[-1], time.units, time.calendar)
        elif hasattr(time, "units"):
            s = num2date(time[0], time.units)
            e = num2date(time[-1], time.units)
        else:
            s = num2date(time[0])
            e = num2date(time[-1])

        # TODO: include frequency
        start = "%s%s%s" % (s.year, str(s.month).zfill(2), str(s.day).zfill(2))
        end = "%s%s%s" % (e.year, str(e.month).zfill(2), str(e.day).zfill(2))
        ds.close()
    except Exception as e:
        msg = "failed to get time range: %s " % e
        logger.exception(msg)
        raise Exception(msg)
    return start, end
def get_time(resource, format=None):
    """
    returns all timestamps of a given netCDF file as a list of datetimes.

    :param resource: NetCDF file(s)
    :param format: if a format is provided (e.g. format='%Y%d%m'),
                   values will be converted to strings
    :return: list of timesteps
    """
    if not isinstance(resource, list):
        resource = [resource]
    try:
        if len(resource) > 1:
            ds = MFDataset(resource)
        else:
            ds = Dataset(resource[0])
        time = ds.variables["time"]
    except Exception:
        msg = "failed to get time"
        logger.exception(msg)
        raise Exception(msg)

    try:
        if hasattr(time, "units") and hasattr(time, "calendar"):
            timestamps = num2date(time[:], time.units, time.calendar)
        elif hasattr(time, "units"):
            timestamps = num2date(time[:], time.units)
        else:
            timestamps = num2date(time[:])
        ds.close()
        try:
            if format is not None:
                timestamps = [t.strftime(format=format) for t in timestamps]
        except Exception:
            msg = "failed to convert times to strings"
            print(msg)
            logger.debug(msg)
    except Exception:
        msg = "failed to convert time"
        logger.exception(msg)
        raise Exception(msg)
    return timestamps
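# A usage sketch for get_timerange/get_time, assuming CF-style files with a
# 'time' variable; the file names are hypothetical. A list with more than
# one path is aggregated with MFDataset, a single path opens a Dataset.
if __name__ == '__main__':
    start, end = get_timerange(['tas_1990.nc', 'tas_1991.nc'])
    print(start, end)  # e.g. '19900101' '19911231'

    steps = get_time('tas_1990.nc', format='%Y%m%d')
    print(steps[:3])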
def getCruData(NCFILE, month, dimLon, dimLat):
    from netCDF4 import MFDataset
    import numpy as np

    # account for Python's 0-based indexing
    month = month - 1

    # open netCDF file
    nc = MFDataset(NCFILE)

    # map lat/lon data to the CRU netCDF indexing scheme
    lonStart = np.where(nc.variables['lon'][:] == dimLon[0])[0][0]
    lonEnd = np.where(nc.variables['lon'][:] == dimLon[1])[0][0]
    lonDim = lonEnd - lonStart
    latStart = np.where(nc.variables['lat'][:] == dimLat[0])[0][0]
    latEnd = np.where(nc.variables['lat'][:] == dimLat[1])[0][0]
    latDim = latEnd - latStart

    # number of available years for every month (integer division)
    yearsAvailable = len(nc.dimensions['time']) // 12

    # preallocate the month array, e.g. shape (100, 81, 161)
    monthData = np.empty((yearsAvailable, latDim, lonDim))
    monthData[:] = np.nan

    for i in range(0, yearsAvailable):
        # read data from file for the given month
        monthData[i, :, :] = nc.variables['tmp'][month, latStart:latEnd, lonStart:lonEnd]
        month = month + 12

    # set NA values
    monthData[monthData > 100] = np.nan

    # get vectors of lat/lon data
    longitude = nc.variables['lon'][lonStart:lonEnd]
    latitude = nc.variables['lat'][latStart:latEnd]
    nc.close()

    return monthData, longitude, latitude
def ice_comp_model_to_sat_table_rm(pathToModel, modelYears, modelIteration,
                                   boundLat, pathToOSI, param):
    diff_array = np.zeros((len(modelYears), 12))

    for nnum, yyear in enumerate(modelYears):
        g = Dataset('./grid.cdf')
        dxc = g.variables['dxc'][0, :, :]
        dyc = g.variables['dyc'][0, :, :]
        lat = g.variables['yc'][0, :, :]
        topo = g.variables['topo'][0, :, :]
        dxcXdyc = dxc * dyc

        # area_model = np.zeros((len(modelIteration), 12))

        if modelIteration[0] == 'last':
            gg = glob.glob(pathToModel + '/' + yyear + '/it*')
            gg.sort()
            lastit = [int(gg[-1].split('/')[-1].split('t')[-1])]
        else:
            lastit = modelIteration

        for it, iteration in enumerate(lastit):
            fm = MFDataset(pathToModel + '/' + yyear + '/it' + str(iteration) + '/fw/*.cdf')
            fsat = MFDataset(pathToOSI + yyear + '??.nc')
            for mm in range(12):
                if param == 'area':
                    aa_model = np.ma.filled(fm.variables['area'][mm, :, :], 0) * dxcXdyc
                    bb_satel = fsat.variables['ice'][mm, :, :] * dxcXdyc
                    cc_diff = aa_model - bb_satel
                    # sqrt of the square is the absolute difference
                    diff_array[nnum, mm] = np.sqrt(cc_diff ** 2).sum()
                if param == 'extent':
                    dmodel = np.ma.filled(fm.variables['area'][mm, :, :], 0)
                    dmodel[dmodel < 0.15] = 0
                    dmodel[dmodel >= 0.15] = 1
                    aa_model = dmodel * dxcXdyc

                    dsat = fsat.variables['ice'][mm, :, :]
                    dsat[dsat < 0.15] = 0
                    dsat[dsat >= 0.15] = 1
                    bb_satel = dsat * dxcXdyc

                    cc_diff = aa_model - bb_satel
                    diff_array[nnum, mm] = np.sqrt(cc_diff ** 2).sum()
            fm.close()
            fsat.close()

    return diff_array
def ncinfo(files, hidedims, ignoretime, units, vars=None):

    if isinstance(files, list):
        try:
            ncobj = MFDataset(files)
        except Exception as e:
            warn("Could not aggregate datasets, python library returned: " + str(e))
            return
    else:
        print()
        print(files)
        ncobj = Dataset(files, 'r')

    varnames = ncobj.variables.keys()
    varname_maxlen = len(max(varnames, key=len))

    pr_varnames = []
    pr_dimensions = []
    pr_longnames = []

    for varname in varnames:

        var = ncobj.variables[varname]

        if "time" == varname.lower():
            if not var.ndim == 1:
                warn("I don't understand two dimensional time dimensions")
                continue
            # Get our time axis
            nsteps = len(var)
            try:
                unit = var.__getattribute__("units").partition(' ')[0]
            except AttributeError:
                unit = 'None'
            if nsteps > 1:
                print("Time steps: ", nsteps, " x ", var[1] - var[0], unit)
            elif nsteps == 1:
                print("Time : ", var[0], unit)
            continue

        if ignoretime and "time" in varname.lower():
            continue

        if vars is not None:
            if varname not in vars:
                continue

        if var.ndim == 1:
            dims = ncobj.variables[varname].dimensions
            if hidedims and dims[0] == varname:
                # This is a dimension variable, ignore
                continue
            if ignoretime and dims[0] == "time":
                # Time bounds stuff, also ignore
                continue

        try:
            long_name = var.__getattribute__("long_name")
        except AttributeError:
            long_name = ''

        if units:
            try:
                unit = "(" + var.__getattribute__("units") + ")"
            except AttributeError:
                unit = ''
            long_name = " ".join([long_name, unit])

        pr_varnames.append(str(varname))
        pr_dimensions.append(str(var.shape))
        pr_longnames.append(str(long_name))

    fmt = '{0:{1}} :: {2:{3}} :: {4}'
    pr_varnames_maxlen = len(max(pr_varnames, key=len))
    pr_dimensions_maxlen = len(max(pr_dimensions, key=len))
    for varstr, dimstr, namestr in zip(pr_varnames, pr_dimensions, pr_longnames):
        print(fmt.format(varstr, pr_varnames_maxlen,
                         dimstr, pr_dimensions_maxlen, namestr))
def runTest(self):
    """testing multi-file dataset access"""
    f = MFDataset(self.files, check=True)
    f.set_auto_maskandscale(True)  # issue570
    f.set_always_mask(False)
    assert f.history == 'created today'
    assert_array_equal(np.arange(0, nx), f.variables['x'][:])
    varin = f.variables['data']
    datin = varin[:]
    assert_array_equal(datin.mask, data.mask)
    varin.set_auto_maskandscale(False)
    data2 = data.filled()
    assert varin.long_name == 'phony data'
    assert len(varin) == nx
    assert varin.shape == (nx, ydim, zdim)
    assert varin.dimensions == ('x', 'y', 'z')
    assert_array_equal(varin[4:-4:4, 3:5, 2:8], data2[4:-4:4, 3:5, 2:8])
    assert varin[0, 0, 0] == data2[0, 0, 0]
    assert_array_equal(varin[:], data2)
    assert getattr(varin, 'nonexistantatt', None) is None
    f.close()
    # test master_file kwarg (issue #835).
    f = MFDataset(self.files, master_file=self.files[-1], check=True)
    assert_array_equal(np.arange(0, nx), f.variables['x'][:])
    varin = f.variables['data']
    assert_array_equal(varin[4:-4:4, 3:5, 2:8], data2[4:-4:4, 3:5, 2:8])
    f.close()
    # testing multi-file get_variables_by_attributes
    f = MFDataset(self.files, check=True)
    assert f.get_variables_by_attributes(axis='T') == []
    # the comparison must be asserted, otherwise the check is a no-op
    assert f.get_variables_by_attributes(units='zlotys')[0] == f['x']
    f.close()
def __init__(self, files, thkth=1.0, **kwargs):
    MFDataset.__init__(self, files, **kwargs)
    self.__dict__['thkth'] = thkth
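# A runnable sketch, assuming the __init__ above belongs to an MFDataset
# subclass; the class name ThicknessMFDataset is hypothetical. The attribute
# is stored via __dict__ because Dataset/MFDataset intercept __setattr__ to
# write netCDF attributes, which a read-only multi-file dataset cannot do.
if __name__ == '__main__':
    class ThicknessMFDataset(MFDataset):
        def __init__(self, files, thkth=1.0, **kwargs):
            MFDataset.__init__(self, files, **kwargs)
            self.__dict__['thkth'] = thkth

    # reuses the mftest*.nc files written by the tutorial snippet above
    ds = ThicknessMFDataset('mftest*nc', thkth=0.5)
    print(ds.thkth)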
def test_get_by_mfdataset(self):
    """testing multi-file get_variables_by_attributes."""
    f = MFDataset(self.files, check=True)
    assert f.get_variables_by_attributes(axis='T') == []
    # the comparison must be asserted, otherwise the check is a no-op
    assert f.get_variables_by_attributes(units='zlotys')[0] == f['x']
    f.close()
def test_tutorial():
    # 2 unlimited dimensions.
    # temp = rootgrp.createVariable('temp','f4',('time','level','lat','lon',))
    # this makes the compression 'lossy' (preserving a precision of 1/1000)
    # try it and see how much smaller the file gets.
    temp = rootgrp.createVariable('temp', 'f4',
                                  ('time', 'level', 'lat', 'lon',),
                                  least_significant_digit=3)
    # attributes.
    import time
    rootgrp.description = 'bogus example script'
    rootgrp.history = 'Created ' + time.ctime(time.time())
    rootgrp.source = 'netCDF4 python module tutorial'
    latitudes.units = 'degrees north'
    longitudes.units = 'degrees east'
    levels.units = 'hPa'
    temp.units = 'K'
    times.units = 'hours since 0001-01-01 00:00:00.0'
    times.calendar = 'gregorian'
    for name in rootgrp.ncattrs():
        print('Global attr', name, '=', getattr(rootgrp, name))
    print(rootgrp)
    print(rootgrp.__dict__)
    print(rootgrp.variables)
    print(rootgrp.variables['temp'])
    import numpy
    # no unlimited dimension, just assign to slice.
    lats = numpy.arange(-90, 91, 2.5)
    lons = numpy.arange(-180, 180, 2.5)
    latitudes[:] = lats
    longitudes[:] = lons
    print('latitudes =\n', latitudes[:])
    print('longitudes =\n', longitudes[:])
    # append along two unlimited dimensions by assigning to slice.
    nlats = len(rootgrp.dimensions['lat'])
    nlons = len(rootgrp.dimensions['lon'])
    print('temp shape before adding data = ', temp.shape)
    from numpy.random.mtrand import uniform  # random number generator.
    temp[0:5, 0:10, :, :] = uniform(size=(5, 10, nlats, nlons))
    print('temp shape after adding data = ', temp.shape)
    # levels have grown, but no values yet assigned.
    print('levels shape after adding pressure data = ', levels.shape)
    # assign values to levels dimension variable.
    levels[:] = [1000., 850., 700., 500., 300., 250., 200., 150., 100., 50.]
    # fancy slicing
    tempdat = temp[::2, [1, 3, 6], lats > 0, lons > 0]
    print('shape of fancy temp slice = ', tempdat.shape)
    print(temp[0, 0, [0, 1, 2, 3], [0, 1, 2, 3]].shape)
    # fill in times.
    from datetime import datetime, timedelta
    from netCDF4 import num2date, date2num, date2index
    dates = [datetime(2001, 3, 1) + n * timedelta(hours=12)
             for n in range(temp.shape[0])]
    times[:] = date2num(dates, units=times.units, calendar=times.calendar)
    print('time values (in units %s): ' % times.units + '\n', times[:])
    dates = num2date(times[:], units=times.units, calendar=times.calendar)
    print('dates corresponding to time values:\n', dates)
    rootgrp.close()
    # create a series of netCDF files with a variable sharing
    # the same unlimited dimension.
    for nfile in range(10):
        f = Dataset('mftest' + repr(nfile) + '.nc', 'w', format='NETCDF4_CLASSIC')
        f.createDimension('x', None)
        x = f.createVariable('x', 'i', ('x',))
        x[0:10] = numpy.arange(nfile * 10, 10 * (nfile + 1))
        f.close()
    # now read all those files in at once, in one Dataset.
    from netCDF4 import MFDataset
    f = MFDataset('mftest*nc')
    print(f.variables['x'][:])
    # example showing how to save numpy complex arrays using compound types.
    f = Dataset('complex.nc', 'w')
    size = 3  # length of 1-d complex array
    # create sample complex data.
    datac = numpy.exp(1j * (1. + numpy.linspace(0, numpy.pi, size)))
    print(datac.dtype)
    # create complex128 compound data type.
    complex128 = numpy.dtype([('real', numpy.float64), ('imag', numpy.float64)])
    complex128_t = f.createCompoundType(complex128, 'complex128')
    # create a variable with this data type, write some data to it.
    f.createDimension('x_dim', None)
    v = f.createVariable('cmplx_var', complex128_t, 'x_dim')
    data = numpy.empty(size, complex128)  # numpy structured array
    data['real'] = datac.real
    data['imag'] = datac.imag
    v[:] = data
    # close and reopen the file, check the contents.
    f.close()
    f = Dataset('complex.nc')
    print(f)
    print(f.variables['cmplx_var'])
    print(f.cmptypes)
    print(f.cmptypes['complex128'])
    v = f.variables['cmplx_var']
    print(v.shape)
    datain = v[:]  # read in all the data into a numpy structured array
    # create an empty numpy complex array
    datac2 = numpy.empty(datain.shape, numpy.complex128)
    # .. fill it with contents of structured array.
    datac2.real = datain['real']
    datac2.imag = datain['imag']
    print(datac.dtype, datac)
    print(datac2.dtype, datac2)
    # more complex compound type example.
    from netCDF4 import chartostring, stringtoarr
    f = Dataset('compound_example.nc', 'w')  # create a new dataset.
    # create an unlimited dimension called 'station'
    f.createDimension('station', None)
    # define a compound data type (can contain arrays, or nested compound types).
    NUMCHARS = 80  # number of characters to use in fixed-length strings.
    winddtype = numpy.dtype([('speed', 'f4'), ('direction', 'i4')])
    statdtype = numpy.dtype([('latitude', 'f4'), ('longitude', 'f4'),
                             ('surface_wind', winddtype),
                             ('temp_sounding', 'f4', 10),
                             ('press_sounding', 'i4', 10),
                             ('location_name', 'S1', NUMCHARS)])
    # use these data type definitions to create compound data types
    # via the createCompoundType Dataset method.
    # create a compound type for vector wind which will be nested inside
    # the station data type. This must be done first!
    wind_data_t = f.createCompoundType(winddtype, 'wind_data')
    # now that wind_data_t is defined, create the station data type.
    station_data_t = f.createCompoundType(statdtype, 'station_data')
    # create nested compound data types to hold the units variable attribute.
    winddtype_units = numpy.dtype([('speed', 'S1', NUMCHARS),
                                   ('direction', 'S1', NUMCHARS)])
    statdtype_units = numpy.dtype([('latitude', 'S1', NUMCHARS),
                                   ('longitude', 'S1', NUMCHARS),
                                   ('surface_wind', winddtype_units),
                                   ('temp_sounding', 'S1', NUMCHARS),
                                   ('location_name', 'S1', NUMCHARS),
                                   ('press_sounding', 'S1', NUMCHARS)])
    # create the wind_data_units type first, since it will be nested inside
    # the station_data_units data type.
    wind_data_units_t = f.createCompoundType(winddtype_units, 'wind_data_units')
    station_data_units_t = \
        f.createCompoundType(statdtype_units, 'station_data_units')
    # create a variable of type 'station_data_t'
    statdat = f.createVariable('station_obs', station_data_t, ('station',))
    # create a numpy structured array, assign data to it.
    data = numpy.empty(1, station_data_t)
    data['latitude'] = 40.
    data['longitude'] = -105.
    data['surface_wind']['speed'] = 12.5
    data['surface_wind']['direction'] = 270
    data['temp_sounding'] = (280.3, 272., 270., 269., 266., 258.,
                             254.1, 250., 245.5, 240.)
    data['press_sounding'] = range(800, 300, -50)
    # variable-length string datatypes are not supported inside compound types,
    # so to store strings in a compound data type, each string must be
    # stored as a fixed-size (in this case 80) array of characters.
    data['location_name'] = stringtoarr('Boulder, Colorado, USA', NUMCHARS)
    # assign structured array to variable slice.
    statdat[0] = data
    # or just assign a tuple of values to variable slice
    # (will automatically be converted to a structured array).
    statdat[1] = (40.78, -73.99, (-12.5, 90),
                  (290.2, 282.5, 279., 277.9, 276., 266., 264.1, 260., 255.5, 243.),
                  range(900, 400, -50),
                  stringtoarr('New York, New York, USA', NUMCHARS))
    print(f.cmptypes)
    windunits = numpy.empty(1, winddtype_units)
    stationobs_units = numpy.empty(1, statdtype_units)
    windunits['speed'] = stringtoarr('m/s', NUMCHARS)
    windunits['direction'] = stringtoarr('degrees', NUMCHARS)
    stationobs_units['latitude'] = stringtoarr('degrees north', NUMCHARS)
    stationobs_units['longitude'] = stringtoarr('degrees west', NUMCHARS)
    stationobs_units['surface_wind'] = windunits
    stationobs_units['location_name'] = stringtoarr('None', NUMCHARS)
    stationobs_units['temp_sounding'] = stringtoarr('Kelvin', NUMCHARS)
    stationobs_units['press_sounding'] = stringtoarr('hPa', NUMCHARS)
    statdat.units = stationobs_units
    # close and reopen the file.
    f.close()
    f = Dataset('compound_example.nc')
    print(f)
    statdat = f.variables['station_obs']
    print(statdat)
    # print out data in variable.
    print('data in a variable of compound type:')
    print('----')
    for data in statdat[:]:
        for name in statdat.dtype.names:
            if data[name].dtype.kind == 'S':  # a string
                # convert array of characters back to a string for display.
                units = chartostring(statdat.units[name])
                print(name, ': value =', chartostring(data[name]),
                      ': units=', units)
            elif data[name].dtype.kind == 'V':  # a nested compound type
                units_list = [chartostring(s) for s in tuple(statdat.units[name])]
                print(name, data[name].dtype.names, ': value=', data[name],
                      ': units=', units_list)
            else:  # a numeric type.
                units = chartostring(statdat.units[name])
                print(name, ': value=', data[name], ': units=', units)
    print('----')
    f.close()
    f = Dataset('tst_vlen.nc', 'w')
    vlen_t = f.createVLType(numpy.int32, 'phony_vlen')
    x = f.createDimension('x', 3)
    y = f.createDimension('y', 4)
    vlvar = f.createVariable('phony_vlen_var', vlen_t, ('y', 'x'))
    import random
    data = numpy.empty(len(y) * len(x), object)
    for n in range(len(y) * len(x)):
        data[n] = numpy.arange(random.randint(1, 10), dtype='int32') + 1
    data = numpy.reshape(data, (len(y), len(x)))
    vlvar[:] = data
    print(vlvar)
    print('vlen variable =\n', vlvar[:])
    print(f)
    print(f.variables['phony_vlen_var'])
    print(f.vltypes['phony_vlen'])
    z = f.createDimension('z', 10)
    strvar = f.createVariable('strvar', str, 'z')
    chars = '1234567890aabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    data = numpy.empty(10, object)
    for n in range(10):
        stringlen = random.randint(2, 12)
        data[n] = ''.join([random.choice(chars) for i in range(stringlen)])
    strvar[:] = data
    print('variable-length string variable:\n', strvar[:])
    print(f)
    print(f.variables['strvar'])
    f.close()
def readEnsemble(wrfinit, timerange=None, fields=None, debug=False):
    """Read in desired fields and return 2-D arrays of data for each field (fill/contour/barb)."""
    if debug:
        print(fields)
    datadict = {}
    file_list, missing_list = makeEnsembleList(wrfinit, timerange)  # construct list of files

    # loop through fill field, contour field, barb field and retrieve required data
    for f in ['fill', 'contour', 'barb']:
        if not fields[f].keys():
            continue
        if debug:
            print('Reading field:', fields[f]['name'], 'from', fields[f]['filename'])

        # save some variables for use in this function
        filename = fields[f]['filename']
        arrays = fields[f]['arrayname']
        fieldtype = fields[f]['ensprod']
        if fieldtype in ['prob', 'neprob']:
            thresh = fields[f]['thresh']
        if fieldtype[0:3] == 'mem':
            member = int(fieldtype[3:])

        # open multi-file netcdf dataset
        if debug:
            print(file_list[filename])
        fh = MFDataset(file_list[filename])

        # loop through each field; wind fields will have two arrays that need to be read
        datalist = []
        for array in arrays:
            # read in 3D array (times*members, ny, nx) from file object
            if 'arraylevel' in fields[f]:
                if fields[f]['arraylevel'] != 'max':
                    data = fh.variables[array][:, 0, fields[f]['arraylevel'], :, :]
                else:
                    data = np.amax(fh.variables[array][:, 0, :, :, :], axis=1)  # GSR
            # elif 'sfclevel' in fields[f]:
            #     data = fh.variables[array][:, :, :]
            else:
                data = fh.variables[array][:, 0, :, :]
                # else: data = fh.variables[array][:, :, :]

            # change units for certain fields
            if array in ['U_GRID_PRS', 'V_GRID_PRS', 'UBSHR6', 'VBSHR6', 'U10', 'V10',
                         'U_COMP_STM', 'V_COMP_STM', 'S_PL']:
                data = data * 1.93  # m/s -> kt
            if array in ['mean_V10_d01', 'mean_U10_d01']:
                data = data * 1.93 * 10.0  # m/s -> 0.1 kt
            if array in ['MSL_PRES']:
                data = data / 100.  # mb
            if array in ['P_WAT']:
                data = data * 0.0393701  # mb
            elif array in ['DEWPOINT_2M', 'T2', 'AFWA_WCHILL', 'AFWA_HEATIDX']:
                data = (data - 273.15) * 1.8 + 32.0  # K -> F
            elif array in ['PREC_ACC_NC', 'PREC_ACC_C', 'AFWA_PWAT', 'PWAT',
                           'AFWA_SNOWFALL', 'AFWA_SNOW', 'AFWA_ICE', 'AFWA_FZRA']:
                data = data * 0.0393701  # mm -> in
            elif array in ['RAINNC', 'GRPL_MAX', 'SNOW_ACC_NC']:
                data = data * 0.0393701  # mm -> in
            elif array in ['TEMP_PRS', 'DEWPOINT_PRS', 'SFC_LI']:
                data = data - 273.15  # K -> C
            elif array in ['ABS_VORT_PRS']:
                data = data * 100000.0
            elif array in ['AFWA_MSLP', 'MSLP']:
                data = data * 0.01  # Pa -> hPa
            elif array in ['ECHOTOP']:
                data = data * 0.001  # m -> km
            elif array in ['SBCINH', 'MLCINH', 'W_DN_MAX']:
                data = data * -1.0  # make CIN positive
            elif array in ['PVORT_320K']:
                data = data * 1000000  # multiply by 1e6
            elif array in ['SBT123_GDS3_NTAT']:
                data = data - 273.15  # K -> C
            elif array in ['SBT124_GDS3_NTAT']:
                data = data - 273.15  # K -> C
            elif array in ['HAIL_MAXK1', 'HAIL_MAX2D']:
                data = data * 39.3701  # m -> inches
            elif array in ['mean_T2_d01']:
                data = data * 1.8  # C -> F
            elif array in ['T_LEV1']:
                data = data * 1.8 + 32.0  # C -> F

            # perform mean/max/variance/etc to reduce 3D array to 2D
            if fieldtype == 'mean':
                data = np.mean(data, axis=0)
            elif fieldtype == 'pmm':
                data = compute_pmm(data)
            elif fieldtype == 'max':
                data = np.amax(data, axis=0)
            elif fieldtype == 'var':
                data = np.std(data, axis=0)
            elif fieldtype == 'summean':
                for i in missing_list[filename]:
                    data = np.insert(data, i, np.nan, axis=0)  # insert nan for missing files
                data = np.reshape(data, (data.shape[0] // 10, 10,
                                         data.shape[1], data.shape[2]))
                data = np.nansum(data, axis=0)
                data = np.nanmean(data, axis=0)
            elif fieldtype[0:3] == 'mem':
                for i in missing_list[filename]:
                    data = np.insert(data, i, np.nan, axis=0)  # insert nan for missing files
                data = np.reshape(data, (data.shape[0] // 10, 10,
                                         data.shape[1], data.shape[2]))
                data = np.nanmax(data, axis=0)
                data = data[member - 1, :]
            elif fieldtype in ['prob', 'neprob']:
                data = (data >= thresh).astype('float')
                for i in missing_list[filename]:
                    data = np.insert(data, i, np.nan, axis=0)  # insert nan for missing files
                data = np.reshape(data, (data.shape[0] // 10, 10,
                                         data.shape[1], data.shape[2]))
                data = np.nanmax(data, axis=0)
                if fieldtype == 'neprob':
                    data = compute_neprob(data, roi=14, sigma=40)  # nw = neighborhood width
                else:
                    data = np.nanmean(data, axis=0)
                data = data + 0.001  # hack to ensure that plot displays discrete prob values

            if debug:
                print('Returning', array, 'with shape', data.shape,
                      'max', data.max(), 'min', data.min())
            datalist.append(data)

        # attach data arrays for each type of field (e.g. {'fill': [data], 'barb': [data, data]})
        datadict[f] = datalist
        fh.close()

    # these are derived fields that are not in any of the input files but can be computed
    if 'name' in fields['fill']:
        if fields['fill']['name'] in ['shr06mag', 'shr01mag', 'bunkmag']:
            datadict['fill'] = [np.sqrt(datadict['fill'][0] ** 2 + datadict['fill'][1] ** 2)]
        if fields['fill']['name'] in ['iso300', 'iso500', 'iso700', 'iso850']:
            datadict['fill'] = [np.sqrt(datadict['fill'][0] ** 2 + datadict['fill'][1] ** 2)]
        elif fields['fill']['name'] == 'stp':
            datadict['fill'] = computestp(datadict['fill'])

    return (datadict, missing_list['amem'])
##############################################################################

print(sys.argv[1])

if sys.argv[1] == '-h' or sys.argv[1] == '-help':
    help()
    sys.exit()

# GPU SET UP - bus connection
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# GPU SET UP - visible GPUs in this script
os.environ["CUDA_VISIBLE_DEVICES"] = str(sys.argv[2])
num_gpu = str(sys.argv[2]).count(',') + 1

# LOADING DATA
ncfile_r = MFDataset('/home/aidl/git/data/data_final/long_train_10_years/clt/*.nc')
maps = ncfile_r.variables['clt']
Nt, Ny, Nx = maps.shape
print('{} maps ready to be loaded'.format(Nt))

##############################################################################
##############################################################################
##############################################################################
# Parameters to play with.

# BATCH DEFINITION
Nframes = 1       # Number of frames within each sequence to be used during training
batch_size = 128  # Number of sequences to include within each batch during training
epochs = 500
def ice_comp_model_to_sat(pathToModel, modelYear, modelIteration,
                          boundLat, pathToOSI, param='area', threshold=0.15,
                          coast_exp=False):
    """Plot sea ice area (or extent) from satellite data and several model iterations."""
    g = Dataset('./grid.cdf')
    dxc = g.variables['dxc'][0, :, :]
    dyc = g.variables['dyc'][0, :, :]
    lat = g.variables['yc'][0, :, :]
    topo = g.variables['topo'][0, :, :]
    dxcXdyc = dxc * dyc

    if coast_exp:
        topo2 = expand_coast(topo)

    area_model = np.zeros((len(modelIteration), 12))
    for it, iteration in enumerate(modelIteration):
        fm = MFDataset(pathToModel + '/' + modelYear + '/it' + str(iteration) + '/fw/*.cdf')
        for mm in range(12):
            # NB: the flag to test is coast_exp; the original tested the
            # expand_coast function itself, which is always falsy here
            if param == 'area':
                if coast_exp:
                    temp_area = fm.variables['area'][mm, :, :]
                    temp_area = np.ma.masked_array(temp_area, mask=topo2.mask)
                    area_model[it, mm] = calc_area(temp_area, dxcXdyc, lat,
                                                   blat=boundLat,
                                                   threshold=threshold) / 10e11
                else:
                    area_model[it, mm] = calc_area(fm.variables['area'][mm, :, :],
                                                   dxcXdyc, lat, blat=boundLat,
                                                   threshold=threshold) / 10e11
            elif param == 'extent':
                if coast_exp:
                    temp_area = fm.variables['area'][mm, :, :]
                    temp_area = np.ma.masked_array(temp_area, mask=topo2.mask)
                    area_model[it, mm] = calc_extent(temp_area, dxcXdyc, lat,
                                                     blat=boundLat,
                                                     threshold=threshold) / 10e11
                else:
                    area_model[it, mm] = calc_extent(fm.variables['area'][mm, :, :],
                                                     dxcXdyc, lat, blat=boundLat,
                                                     threshold=threshold) / 10e11
        fm.close()

    fsat = MFDataset(pathToOSI + modelYear + '??.nc')
    # osi_ice = fsat.variables['ice'][0, :, :]
    osi_area = []
    for mm in range(12):
        if param == 'area':
            area_temp = fsat.variables['ice'][mm, :, :]
            if coast_exp:
                area_temp = np.ma.masked_array(area_temp, mask=topo2.mask)
            else:
                area_temp = np.ma.masked_array(area_temp, mask=topo.mask)
            area_temp = np.ma.masked_less_equal(area_temp, threshold)
            osi_area.append(calc_area(np.ma.filled(area_temp, 0), dxcXdyc, lat,
                                      blat=boundLat, threshold=threshold) / 10e11)
        elif param == 'extent':
            area_temp = fsat.variables['ice'][mm, :, :]
            if coast_exp:
                area_temp = np.ma.masked_array(area_temp, mask=topo2.mask)
            else:
                area_temp = np.ma.masked_array(area_temp, mask=topo.mask)
            area_temp = np.ma.masked_less_equal(area_temp, threshold)
            osi_area.append(calc_extent(np.ma.filled(area_temp, 0), dxcXdyc, lat,
                                        blat=boundLat, threshold=threshold) / 10e11)

    dates = pd.date_range(modelYear + '-01', str(int(modelYear) + 1) + '-01', freq='M')
    dd = pd.DataFrame(index=dates)
    dd['Satellite'] = osi_area
    for it, iteration in enumerate(modelIteration):
        dd['it' + str(iteration)] = area_model[it, :]

    return dd.plot(figsize=(10, 5))
def update(self, FileName):
    """Point to a new nc file or url without reinitializing everything."""
    if isinstance(FileName, list):
        self.Dataset = MFDataset(FileName)
    else:
        self.Dataset = Dataset(FileName)
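# A usage sketch for update(), assuming `reader` is an instance of the
# enclosing class (file names are hypothetical): a list of paths switches
# the reader to an aggregated MFDataset, a single path or URL to a Dataset.
# reader.update('ocean_his_0001.nc')                          # -> Dataset
# reader.update(['ocean_his_0001.nc', 'ocean_his_0002.nc'])   # -> MFDataset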
def __init__(self, filename=None, name=None, gridfile=None):

    if filename is None:
        raise ValueError('Need filename as argument to constructor')

    # Map ROMS variable names to CF standard_name
    self.ROMS_variable_mapping = {
        # Removing (temporarily) land_binary_mask from ROMS-variables,
        # as this leads to trouble with linearNDFast interpolation
        'mask_rho': 'land_binary_mask',
        'mask_psi': 'land_binary_mask',
        'h': 'sea_floor_depth_below_sea_level',
        'zeta': 'sea_surface_height',
        'u': 'x_sea_water_velocity',
        'v': 'y_sea_water_velocity',
        'w': 'upward_sea_water_velocity',
        'temp': 'sea_water_temperature',
        'salt': 'sea_water_salinity',
        'uice': 'sea_ice_x_velocity',
        'vice': 'sea_ice_y_velocity',
        'aice': 'sea_ice_area_fraction',
        'hice': 'sea_ice_thickness',
        'gls': 'turbulent_generic_length_scale',
        'tke': 'turbulent_kinetic_energy',
        'AKs': 'ocean_vertical_diffusivity',
        'sustr': 'surface_downward_x_stress',
        'svstr': 'surface_downward_y_stress',
        'Uwind': 'x_wind',
        'Vwind': 'y_wind'}

    # z-levels to which sigma-layers may be interpolated
    self.zlevels = np.array([
        0, -.5, -1, -3, -5, -10, -25, -50, -75, -100, -150, -200,
        -250, -300, -400, -500, -600, -700, -800, -900, -1000, -1500,
        -2000, -2500, -3000, -3500, -4000, -4500, -5000, -5500, -6000,
        -6500, -7000, -7500, -8000])

    gls_param = ['gls_cmu0', 'gls_p', 'gls_m', 'gls_n']

    filestr = str(filename)
    if name is None:
        self.name = filestr
    else:
        self.name = name

    try:
        # Open file, check that everything is ok
        self.logger.info('Opening dataset: ' + filestr)
        if ('*' in filestr) or ('?' in filestr) or ('[' in filestr):
            self.logger.info('Opening files with MFDataset')

            def drop_non_essential_vars_pop(ds):
                dropvars = [v for v in ds.variables
                            if v not in list(self.ROMS_variable_mapping.keys()) +
                            gls_param + ['ocean_time', 's_rho', 'Cs_r', 'hc', 'angle']
                            and v[0:3] not in ['lon', 'lat', 'mas']]
                self.logger.debug('Dropping variables: %s' % dropvars)
                ds = ds.drop(dropvars)
                return ds

            if has_xarray is True:
                self.Dataset = xr.open_mfdataset(
                    filename, chunks={'ocean_time': 1},
                    concat_dim='ocean_time',
                    preprocess=drop_non_essential_vars_pop,
                    data_vars='minimal', coords='minimal')
            else:
                self.Dataset = MFDataset(filename)
        else:
            self.logger.info('Opening file with Dataset')
            if has_xarray is True:
                self.Dataset = xr.open_dataset(filename)
            else:
                self.Dataset = Dataset(filename, 'r')
    except Exception as e:
        raise ValueError(e)

    if 's_rho' not in self.Dataset.variables:
        dimensions = 2
    else:
        dimensions = 3

    if dimensions == 3:
        # Read sigma-coordinate values
        try:
            self.sigma = self.Dataset.variables['s_rho'][:]
        except Exception:
            num_sigma = len(self.Dataset.dimensions['s_rho'])
            self.logger.warning(
                's_rho not available in dataset, constructing from'
                ' number of layers (%s).' % num_sigma)
            self.sigma = (np.arange(num_sigma) + .5 - num_sigma) / num_sigma

        # Read sigma-coordinate transform parameters
        try:
            self.Dataset.variables['Cs_r'].set_auto_mask(False)
        except Exception:
            pass
        self.Cs_r = self.Dataset.variables['Cs_r'][:]
        try:
            self.hc = self.Dataset.variables['hc'][:]
        except Exception:
            if has_xarray is True:
                self.hc = self.Dataset.variables['hc'].data  # scalar
            else:
                self.hc = self.Dataset.variables['hc'][0]

        self.num_layers = len(self.sigma)
    else:
        self.num_layers = 1
        self.ROMS_variable_mapping['ubar'] = 'x_sea_water_velocity'
        self.ROMS_variable_mapping['vbar'] = 'y_sea_water_velocity'
        del self.ROMS_variable_mapping['u']
        del self.ROMS_variable_mapping['v']

    if 'lat_rho' in self.Dataset.variables:
        # Horizontal coordinates and directions
        self.lat = self.Dataset.variables['lat_rho'][:]
        self.lon = self.Dataset.variables['lon_rho'][:]
    else:
        if gridfile is None:
            raise ValueError(filename + ' does not contain lon/lat '
                             'arrays, please supply a grid-file '
                             '"gridfile=<grid_file>"')
        else:
            gf = Dataset(gridfile)
            self.lat = gf.variables['lat_rho'][:]
            self.lon = gf.variables['lon_rho'][:]

    try:  # Check for GLS parameters (diffusivity)
        self.gls_parameters = {}
        for gls_par in gls_param:
            self.gls_parameters[gls_par] = \
                self.Dataset.variables[gls_par][()]
        self.logger.info('Read GLS parameters from file.')
    except Exception as e:
        self.logger.info(e)
        self.logger.info('Did not find complete set of GLS parameters')

    # Get time coverage
    try:
        ocean_time = self.Dataset.variables['ocean_time']
    except Exception:
        ocean_time = self.Dataset.variables['time']
    if has_xarray:
        self.times = [datetime.utcfromtimestamp(
            (OT - np.datetime64('1970-01-01T00:00:00Z')) /
            np.timedelta64(1, 's')) for OT in ocean_time.data]
    else:
        time_units = ocean_time.__dict__['units']
        if time_units == 'second':
            self.logger.info('Ocean time given as seconds relative to start.'
                             ' Setting artificial start time of 1 Jan 2000.')
            time_units = 'seconds since 2000-01-01 00:00:00'
        self.times = num2date(ocean_time[:], time_units)
    self.start_time = self.times[0]
    self.end_time = self.times[-1]
    if len(self.times) > 1:
        self.time_step = self.times[1] - self.times[0]
    else:
        self.time_step = None

    # x and y are rows and columns for unprojected datasets
    self.xmin = 0.
    self.delta_x = 1.
    self.ymin = 0.
    self.delta_y = 1.
    if has_xarray:
        self.xmax = self.Dataset['xi_rho'].shape[0] - 1.
        self.ymax = self.Dataset['eta_rho'].shape[0] - 1.
        self.lon = self.lon.data  # Extract, could be avoided downstream
        self.lat = self.lat.data
        self.sigma = self.sigma.data
    else:
        # np.float is deprecated; the built-in float is equivalent here
        self.xmax = float(len(self.Dataset.dimensions['xi_rho'])) - 1
        self.ymax = float(len(self.Dataset.dimensions['eta_rho'])) - 1

    self.name = 'roms native'

    self.precalculate_s2z_coefficients = True

    # Find all variables having standard_name
    self.variables = []
    for var_name in self.Dataset.variables:
        if var_name in self.ROMS_variable_mapping.keys():
            var = self.Dataset.variables[var_name]
            self.variables.append(self.ROMS_variable_mapping[var_name])

    # Run constructor of parent Reader class
    super(Reader, self).__init__()
def get_tz_crosssection_for_the_point(self, lon=None, lat=None, zlist=None,
                                      var_name="", start_date=None, end_date=None):
    """
    get the t-z cross-section matrix for the point on the zlist levels
    Note: if zlist is None, the profiles are returned on model levels
    :param lon:
    :param lat:
    :param zlist:
    :param var_name:
    :param start_date:
    :param end_date:
    """
    if self.model_kdtree is None:
        xs, ys, zs = lat_lon.lon_lat_to_cartesian(self.lons.flatten(),
                                                  self.lats.flatten())
        self.model_kdtree = cKDTree(list(zip(xs, ys, zs)))

    xt, yt, zt = lat_lon.lon_lat_to_cartesian(lon, lat)

    start_year = start_date.year
    end_year = end_date.year

    # Get the nearest neighbors for interpolation
    dists_from, inds_from = self.model_kdtree.query([(xt, yt, zt), ], k=1)

    # Use the inverse of the square of the distance for the weighted average
    weights = 1.0 / dists_from ** 2
    weights /= weights.sum()

    inds_from = inds_from.squeeze()
    weights = weights.squeeze()

    if len(weights.shape) == 0:
        weights = [weights, ]

    neighbor_lons = self.lons.flatten()[inds_from]
    neighbor_lats = self.lats.flatten()[inds_from]

    i_list, j_list = [], []
    if dists_from.ndim > 1:
        for the_lon, the_lat in zip(neighbor_lons, neighbor_lats):
            i, j = np.where((self.lons == the_lon) & (self.lats == the_lat))
            i_list.append(i[0])
            j_list.append(j[0])
    else:
        i, j = np.where((self.lons == neighbor_lons) & (self.lats == neighbor_lats))
        i_list.append(i[0])
        j_list.append(j[0])

    profiles = []
    dates = []

    ztarget = np.asarray(zlist) if zlist is not None else None
    vert_kdtree = None

    for the_year in range(start_year, end_year + 1):
        # cube dimensions (t, z, y, x)
        print("treating the following files: {}".format(
            ", ".join(self.year_to_path[the_year])))
        with MFDataset(self.year_to_path[the_year]) as ds:
            data = ds.variables[var_name]

            time_var = ds.variables["time_counter"]
            time_data = time_var[:]
            assert np.all(time_data == np.array(sorted(time_data))), \
                "Time data is not sorted: {}".format(time_data)

            if end_date.hour == 0:
                end_date += timedelta(days=1)

            d1 = date2num(start_date, time_var.units)
            d2 = date2num(end_date, time_var.units)

            current_dates = num2date([t for t in time_var[:] if d1 <= t <= d2],
                                     units=time_var.units)

            data = data[np.where((d1 <= time_var[:]) & (time_var[:] <= d2))[0], :, :, :]

            # Use inverse squared distances to interpolate in the horizontal
            prof = data[:, :, j_list[0], i_list[0]] * weights[0]
            for i, j, weight in zip(i_list[1:], j_list[1:], weights[1:]):
                prof += data[:, :, j, i] * weight

            # Linear interpolation in the vertical
            if "deptht" in ds.variables:
                zsource = ds.variables["deptht"][:]
            elif "depthu" in ds.variables:
                zsource = ds.variables["depthu"][:]
            elif "depthv" in ds.variables:
                zsource = ds.variables["depthv"][:]
            elif "depthw" in ds.variables:
                zsource = ds.variables["depthw"][:]
            else:
                raise Exception("Could not find vertical coordinate")

            if vert_kdtree is None:
                vert_kdtree = cKDTree([[z, ] for z in zsource])

            # No interpolation if the vertical levels are not supplied
            ztarget = zsource if ztarget is None else ztarget

            zdists, zinds = vert_kdtree.query([[z, ] for z in ztarget], k=2)
            zdists = zdists.squeeze()
            zinds = zinds.squeeze()
            zweights = zdists / zdists.sum(axis=1)[:, np.newaxis]  # weight1 = d2/(d1 + d2)
            prof = prof[:, zinds[:, 0]] * zweights[np.newaxis, :, 1] + \
                   prof[:, zinds[:, 1]] * zweights[np.newaxis, :, 0]

            profiles.extend(prof)
            print("Selected data for the time range: ",
                  current_dates[0], current_dates[-1])
            print("The limits are ", start_date, end_date)
            dates.extend(current_dates)

    # Calculate the model bottom
    bottom = 0
    for i, j in zip(i_list, j_list):
        bottom += self.bathymetry[i, j]
    bottom /= float(len(i_list))

    dates_num = mdates.date2num(dates)
    profiles = np.asarray(profiles)

    # mask everything below the model bottom
    if zlist is None and False:
        profiles = profiles[:, np.where(ztarget <= bottom)]
        profiles = profiles.squeeze()
        ztarget = ztarget[ztarget <= bottom]

    zz, tt = np.meshgrid(ztarget, dates_num)

    # print("nemo tt-ranges: ", tt.min(), tt.max())
    # profiles = np.ma.masked_where(zz > bottom, profiles)

    # plot for debug
    # plt.figure()
    # ax = plt.gca()
    # im = ax.contourf(profiles, levels=np.arange(4, 30, 1))
    # xlimits = ax.get_xlim()
    # ax.plot(xlimits, [bottom, bottom], "k-", lw=2)
    # ax.invert_yaxis()
    # ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y\n%b\n%d"))
    # plt.colorbar(im)
    # plt.show()
    # if True:
    #     raise Exception()

    return tt, zz, profiles
def get_seasonal_clim_field(self, start_year=None, end_year=None,
                            season_to_months=None, varname="sosstsst",
                            level_index=0):
    """
    Get the seasonal mean climatology for a field
    :param start_year:
    :param end_year:
    :param season_to_months:
    :param varname:
    """
    if start_year is None:
        start_year = min(self.year_to_path.keys())

    if end_year is None:
        end_year = max(self.year_to_path.keys())

    # Set up the month to season relation
    month_to_season = defaultdict(lambda: "no-season")
    for m in range(1, 13):
        for s, months in season_to_months.items():
            if m in months:
                month_to_season[m] = s
                break

    season_to_field_list = defaultdict(list)

    for y in range(start_year, end_year + 1):
        fpath = self.year_to_path[y]

        with MFDataset(fpath) as ds:
            data_var = ds.variables[varname]

            if len(data_var.shape) == 3:
                nt, ny, nx = data_var.shape
                data = data_var[:]
            elif len(data_var.shape) == 4:
                nt, nz, ny, nx = data_var.shape
                data = data_var[:, level_index, :, :]
            else:
                raise Exception("Do not know how to handle {}-dimensional fields".format(
                    len(data_var.shape)))

            time_var = ds.variables["time_counter"]
            dates = num2date(time_var[:], time_var.units)

            panel = pd.Panel(data=data, items=dates,
                             major_axis=range(ny), minor_axis=range(nx))
            seas_mean = panel.groupby(lambda d: month_to_season[d.month],
                                      axis="items").mean()

            for the_season in seas_mean:
                season_to_field_list[the_season].append(seas_mean[the_season].values)

    result = {}
    for the_season, field_list in season_to_field_list.items():
        mean_field = np.mean(field_list, axis=0).transpose()
        print(mean_field.shape)
        result[the_season] = np.ma.masked_where(~self.lake_mask, mean_field)

    return result
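# A call sketch for get_seasonal_clim_field, assuming a manager instance
# with year_to_path and lake_mask populated; the season mapping below is an
# illustrative example, and `manager` is a hypothetical instance name.
# season_to_months = {'DJF': (12, 1, 2), 'JJA': (6, 7, 8)}
# clim = manager.get_seasonal_clim_field(start_year=1981, end_year=2010,
#                                        season_to_months=season_to_months,
#                                        varname='sosstsst')
# clim['JJA'] is then a masked 2D field of the JJA mean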
def get_seasonal_clim_field_for_dates(self, start_year=None, end_year=None,
                                      season_to_months=None, varname="sosstsst",
                                      level_index=0,
                                      season_to_selected_dates: dict = None):
    """
    :param start_year:
    :param end_year:
    :param season_to_months:
    :param varname:
    :param level_index:
    :param season_to_selected_dates:
    :return: {season: (clim, std, nobs)}
    """

    # presort the selected dates
    for season in season_to_selected_dates.keys():
        season_to_selected_dates[season] = sorted(season_to_selected_dates[season])

    def __check_if_date_isinlist(d1, dlist):
        """
        :param d1:
        :param dlist: (should be sorted ascending)
        :return:
        """
        if len(dlist) >= 2:
            if d1 < dlist[0] or d1 > dlist[-1]:
                return False

        return datetime(d1.year, d1.month, d1.day) in dlist

    if start_year is None:
        start_year = min(self.year_to_path.keys())

    if end_year is None:
        end_year = max(self.year_to_path.keys())

    # Set up the month to season relation
    month_to_season = defaultdict(lambda: "no-season")
    for m in range(1, 13):
        for s, months in season_to_months.items():
            if m in months:
                month_to_season[m] = s
                break

    # selection of the dates of interest for a season
    def __get_selected_dates_for_month(month):
        aseason = month_to_season[month]
        if aseason in season_to_selected_dates:
            return season_to_selected_dates[aseason]
        return []

    season_to_field_list = defaultdict(list)

    for y in range(start_year, end_year + 1):
        fpath = self.year_to_path[y]

        with MFDataset(fpath) as ds:
            data_var = ds.variables[varname]

            if len(data_var.shape) == 3:
                nt, ny, nx = data_var.shape
                data = data_var[:]
            elif len(data_var.shape) == 4:
                nt, nz, ny, nx = data_var.shape
                data = data_var[:, level_index, :, :]
            else:
                raise Exception("Do not know how to handle {}-dimensional fields".format(
                    len(data_var.shape)))

            time_var = ds.variables["time_counter"]
            dates = num2date(time_var[:], time_var.units)

            panel = pd.Panel(data=data, items=dates,
                             major_axis=range(ny), minor_axis=range(nx))
            seas_mean = panel.groupby(
                lambda d: month_to_season[d.month] if __check_if_date_isinlist(
                    d, __get_selected_dates_for_month(d.month)) else "no-season",
                axis="items").mean()

            print(seas_mean)

            for the_season in seas_mean:
                season_to_field_list[the_season].append(seas_mean[the_season].values)

    result = {}
    for the_season, field_list in season_to_field_list.items():
        mean_field = np.mean(field_list, axis=0).transpose()
        std_field = np.std(field_list, axis=0).transpose()
        nobs = len(field_list)

        print(mean_field.shape)
        result[the_season] = (np.ma.masked_where(~self.lake_mask, mean_field),
                              std_field, nobs)

    return result
def get_seasonal_clim_cross_section_with_ttest_data(self, start_year=None,
                                                    end_year=None,
                                                    season_to_months=None,
                                                    varname="votemper",
                                                    start_point=None,
                                                    end_point=None):
    """
    :param start_year:
    :param end_year:
    :param season_to_months:
    :param varname:
    :param start_point:
    :param end_point:
    """
    if start_year is None:
        start_year = min(self.year_to_path.keys())

    if end_year is None:
        end_year = max(self.year_to_path.keys())

    # Set up the month to season relation
    month_to_season = defaultdict(lambda: "no-season")
    for m in range(1, 13):
        for s, months in season_to_months.items():
            if m in months:
                month_to_season[m] = s
                break

    season_to_field_list = defaultdict(list)

    for y in range(start_year, end_year + 1):
        fpath = self.year_to_path[y]

        with MFDataset(fpath) as ds:
            data_var = ds.variables[varname]

            assert data_var.ndim == 4
            data = data_var[:]  # (t, z, y, x)
            nt, nz, ny, nx = data.shape

            time_var = ds.variables["time_counter"]
            dates = num2date(time_var[:], time_var.units)

            panel = pd.Panel4D(data=data, labels=dates, items=range(nz),
                               major_axis=range(ny), minor_axis=range(nx))
            seas_mean = panel.groupby(lambda d: month_to_season[d.month],
                                      axis="labels").mean()

            print(seas_mean)

            for the_season in seas_mean:
                season_to_field_list[the_season].append(seas_mean[the_season].values)

    result = {}
    for the_season, field_list in season_to_field_list.items():
        mean_field = np.mean(field_list, axis=0).transpose((0, 2, 1))
        std_field = np.std(field_list, axis=0).transpose((0, 2, 1))
        nobs = len(field_list)

        print(mean_field.shape)
        result[the_season] = (np.ma.masked_where(~self.lake_mask, mean_field),
                              std_field, nobs)

    return result
# Ocean heat capacity (ocean_core/ocean_parameters.F90)
cp_ocean = 3992.10322329649

# Read 'descriptor' and 'years' from an external file
f = open("files.txt")
for line in f.readlines():
    exec(line.lstrip())
f.close()
model_label = "%s (%s)" % (descriptor, years)

# TMPDIR where input files are located
tmpdir = "./"

# Open input files
# fstatic = Dataset(tmpdir+'19000101.ocean_geometry.nc', 'r')
fstatic = MFDataset(tmpdir + '*.ocean_static.nc')
ftemp = MFDataset(tmpdir + '*.ocean_annual.nc')

# Time info (num2date expects the numeric values, hence time[:])
time = ftemp.variables["time"]
ntimes = len(time[:])
date = num2date(time[:], time.units, time.calendar.lower())
year = [d.year for d in date]
time_days = date2num(date, 'days since 01-01-0001', time.calendar.lower())

# Grid info
# area = fstatic.variables["Ah"][:]
area = fstatic.variables["area_t"][:]
z = ftemp.variables["zl"][:]
class HighResDataManager(object):

    def __init__(self, path="", vname="", characteristic_scale_deg=0.01,
                 chunks=(5, 500, 500)):
        self.chunks = chunks

        try:
            self.__ds = Dataset(path)
            self.data = da.from_array(Dataset(path).variables[vname],
                                      self.chunks, lock=True)
        except OSError as err:
            import glob

            if isinstance(path, str):
                path_list = glob.glob(path)
            else:
                path_list = path

            path_list = sorted(path_list)

            self.data = [da.from_array(Dataset(p).variables[vname],
                                       self.chunks, lock=True)
                         for p in path_list]
            self.data = da.concatenate(self.data)

            try:
                self.__ds = MFDataset(path_list)
            except ValueError as verr:
                print("Warning: Could not use MFDataset from netCDF4, trying xarray")
                self.__ds = xarray.concat(
                    [xarray.open_dataset(p, chunks={"time": 100})
                     for p in sorted(path_list)],
                    data_vars="minimal", dim="time")

        self.missing_value = None
        if hasattr(self.__ds.variables[vname], "missing_value"):
            self.missing_value = self.__ds.variables[vname].missing_value
        else:
            self.missing_value = np.nan

        self.vname = vname

        # self.data = biggus.OrthoArrayAdapter(self.ds.variables[vname])
        self.lons = None
        self.lats = None
        self.time = None

        self.time_to_index = None

        self.characteristic_scale_deg = characteristic_scale_deg

        self.__read_coordinates_and_time()
        self.__ds.close()

    def get_data_aggregated_in_space(self, chunk_size):
        return self.data.rechunk(chunks=chunk_size).map_blocks()

    def get_annual_max_with_ttest_stats_lazy(self, data, start_year=-np.Inf,
                                             end_year=np.Inf):
        """
        Get the maximum for each year, calculate clim_mean and standard
        deviation, to be able to use them in the t-test
        :param data:
        :param start_year:
        :param end_year:
        :return: (mean of ann max, std of ann max, nyears), mask
        """
        data_sel, time_sel = self.__sel_period(start_year=start_year,
                                               end_year=end_year, arr=data)

        # self.chunks is a tuple, so slice it directly (a tuple has no .shape)
        data_sel = data_sel.rechunk((len(time_sel),) + self.chunks[1:])

        mask = np.abs(data_sel[0, :, :] - self.missing_value) < 1.0e-6

        def annual_max(block):
            tmp = block.reshape((len(time_sel), -1))
            df = pd.DataFrame(index=time_sel, data=tmp)
            return df.groupby(lambda d: d.year,
                              sort=True).max().values.reshape((-1,) + block.shape[1:])

        ann_max_arr = data_sel.map_blocks(annual_max)

        # get climatology and standard deviations
        ann_max_mean_clim = ann_max_arr.mean(axis=0)
        ann_max_std = ann_max_arr.std(axis=0)

        return ann_max_mean_clim, ann_max_std, ann_max_arr.shape[0], mask

    def get_daily_percenile_fields_interpolated_to(self, lons_target, lats_target,
                                                   start_year=-np.Inf,
                                                   end_year=np.Inf,
                                                   percentile=0.5,
                                                   rolling_mean_window_days=None):

        target_scale_deg = (lons_target[1, 1] - lons_target[0, 0] +
                            lats_target[1, 1] - lats_target[0, 0]) / 2.0

        coarsening = int(target_scale_deg / self.characteristic_scale_deg + 0.5)

        print("source_scale: {}\ntarget_scale: {}\ncoarsening coefficient: {}".format(
            self.characteristic_scale_deg, target_scale_deg, coarsening))

        def coarsening_func(x, axis=None):
            _mask = np.less(np.abs(x - self.missing_value), 1.0e-6)

            if np.all(_mask):
                return self.missing_value * np.ma.ones(_mask.shape).mean(axis=axis)

            y = np.ma.masked_where(_mask, x)
            return y.mean(axis=axis)

        # aggregate the data
        trim_excess = True
        data = da.coarsen(coarsening_func, self.data,
                          axes={1: coarsening, 2: coarsening},
                          trim_excess=trim_excess)

        lons_s = da.coarsen(np.mean, da.from_array(self.lons, self.chunks[1:]),
                            axes={0: coarsening, 1: coarsening},
                            trim_excess=trim_excess).compute()

        lats_s = da.coarsen(np.mean, da.from_array(self.lats, self.chunks[1:]),
                            axes={0: coarsening, 1: coarsening},
                            trim_excess=trim_excess).compute()

        source_grid = list(zip(*lat_lon.lon_lat_to_cartesian(lons_s.flatten(),
                                                             lats_s.flatten())))
        print(np.shape(source_grid))
        ktree = KDTree(source_grid)

        dists, inds = ktree.query(list(zip(*lat_lon.lon_lat_to_cartesian(
            lons_target.flatten(), lats_target.flatten()))))

        perc_daily, mask = self.get_daily_percenile_fields_lazy(
            data, start_year=start_year, end_year=end_year,
            percentile=percentile,
            rolling_mean_window_days=rolling_mean_window_days)

        print("perc_daily.shape=", perc_daily.shape)

        # do the interpolation for each day
        perc_daily_interpolated = []
        for perc_field in perc_daily:
            print(perc_field.shape)
            field = np.ma.masked_where(mask, perc_field.compute()).flatten()[inds].reshape(
                lons_target.shape)
            perc_daily_interpolated.append(field)

        return np.array(perc_daily_interpolated)

    # @profile
    def get_daily_percenile_fields_lazy(self, data, start_year=-np.Inf,
                                        end_year=np.Inf, percentile=0.5,
                                        rolling_mean_window_days=None):
        """
        calculate the percentile for each day of year for the specified period
        :param rolling_mean_window_days: if None [default] the rolling mean is
            not applied, if 1 or N, the rolling mean of 1 or N days is applied
            before computing the percentile
        :param percentile: ranges from 0 to 1.0
        :param data: (time, lon, lat) dask array
        :param start_year:
        :param end_year:
        :return: 365 mean fields (1 for each day of year) of the
            <var>percentile</var> percentile, and the mask
        """
        assert isinstance(data, da.Array)

        msg = "The first dimension of data should be time, but data.shape[0]={} " \
              "and len(self.time)={}".format(data.shape[0], len(self.time))
        assert data.shape[0] == len(self.time), msg

        # mask the resulting fields
        epsilon = 1.0e-5

        print("missing_value = {}, isnan(..) = {}".format(
            self.missing_value, np.isnan(self.missing_value)))

        if not np.isnan(self.missing_value):
            mask = np.less_equal(np.abs(data[0, :, :] - self.missing_value), epsilon)
        else:
            mask = np.isnan(data[0, :, :].compute())

        data_sel, time_sel = data, self.time

        assert np.all(np.equal(sorted(time_sel), time_sel)), \
            "Time vector does not appear to be sorted"

        print("start rechunking")
        initial_chunks = tuple(data_sel.chunks)
        data_sel = data_sel.rechunk((len(time_sel),) + data_sel.chunks[1:])
        print("finish rechunking: {} ---> {}".format(initial_chunks, data_sel.chunks))

        perc = data_sel.map_blocks(
            clim_day_percentile_calculator, time_sel, dtype=np.float32,
            rolling_mean_window_days=rolling_mean_window_days,
            percentile=percentile, start_year=start_year, end_year=end_year,
            missing_value=self.missing_value)

        return perc, mask

    def get_seasonal_means_with_ttest_stats_interpolated_to(
            self, lons_target, lats_target, season_to_monthperiod=None,
            start_year=-np.Inf, end_year=np.Inf,
            convert_monthly_accumulators_to_daily=False):
        """
        :param lons_target, lats_target: 2d arrays of target longitudes and latitudes
        :param season_to_monthperiod:
        :param start_year:
        :param end_year:
        :param convert_monthly_accumulators_to_daily: if True, converts monthly
            accumulators to daily
        :return: dict(season: [mean, std, nobs])

        Coarsen the data and coordinates to the target scale and interpolate
        using nearest neighbours.
        """
        target_scale_deg = (lons_target[1, 1] - lons_target[0, 0] +
                            lats_target[1, 1] - lats_target[0, 0]) / 2.0

        coarsening = int(target_scale_deg / self.characteristic_scale_deg + 0.5)

        print("source_scale: {}\ntarget_scale: {}\ncoarsening coefficient: {}".format(
            self.characteristic_scale_deg, target_scale_deg, coarsening))

        def coarsening_func(x, axis=None):
            _mask = np.less(np.abs(x - self.missing_value), 1.0e-6)

            if np.all(_mask):
                return self.missing_value * np.ma.ones(_mask.shape).mean(axis=axis)

            y = np.ma.masked_where(_mask, x)
            return y.mean(axis=axis)

        # aggregate the data
        trim_excess = True
        data = da.coarsen(coarsening_func, self.data,
                          axes={1: coarsening, 2: coarsening},
                          trim_excess=trim_excess)

        lons_s = da.coarsen(np.mean, da.from_array(self.lons, self.chunks[1:]),
                            axes={0: coarsening, 1: coarsening},
                            trim_excess=trim_excess).compute()

        lats_s = da.coarsen(np.mean, da.from_array(self.lats, self.chunks[1:]),
                            axes={0: coarsening, 1: coarsening},
                            trim_excess=trim_excess).compute()

        source_grid = list(zip(*lat_lon.lon_lat_to_cartesian(lons_s.flatten(),
                                                             lats_s.flatten())))
        print(np.shape(source_grid))
        ktree = KDTree(source_grid)

        dists, inds = ktree.query(list(zip(*lat_lon.lon_lat_to_cartesian(
            lons_target.flatten(), lats_target.flatten()))))

        print("data.shape = ", data.shape)

        result, mask = self.__get_seasonal_means_with_ttest_stats_dask_lazy(
            data, season_to_monthperiod=season_to_monthperiod,
            start_year=start_year, end_year=end_year,
            convert_monthly_accumulators_to_daily=convert_monthly_accumulators_to_daily)

        # invoke the computations and interpolate the result
        for season in result:
            print("Computing for {}".format(season))
            for i in range(len(result[season]) - 1):
                result[season][i] = np.ma.masked_where(
                    mask, result[season][i].compute()).flatten()[inds].reshape(
                    lons_target.shape)

        return result

    def __read_coordinates_and_time(self):
        coord_name_tokens = ["lon", "lat", "time"]
        for nc_vname, nc_var in self.__ds.variables.items():
            vname_lc = nc_vname.lower()
            print(vname_lc, type(vname_lc))
            print(nc_var)

            skip = False
            # avoid loading large variables
            if nc_var.ndim > 2:
                skip = True
                print(nc_var.ndim, nc_var.shape)
# avoid variables which do not contain lon, lat or time if not skip: present = False for t in coord_name_tokens: present = present or (t in vname_lc) skip = not present if skip: print("Skipping {}".format(vname_lc)) continue # make sure that this is really a numpy array data = nc_var[:] if hasattr(data, "values"): data = data.values if "lon" in vname_lc: self.lons = data elif "lat" in vname_lc: self.lats = data elif "time" in vname_lc and "bnds" not in vname_lc: # check if the time data are already in some kind of date objects if isinstance(nc_var, xarray.IndexVariable): self.time = data else: if not hasattr(nc_var, "calendar"): self.time = num2date(data, nc_var.units) else: print( "Found the calendar attribute, using calendar={}". format(nc_var.calendar)) self.time = num2date(data, nc_var.units, calendar=nc_var.calendar) if self.lons.ndim == 1: self.lats, self.lons = np.meshgrid(self.lats, self.lons) if self.lons.shape != self.data.shape[1:]: print( "Transposing data, since self.lons.shape={} and self.data.shape={}" .format(self.lons.shape, self.data.shape)) print(type(self.data)) self.data = self.data.transpose(axes=[0, 2, 1]) def get_seasonal_means_with_ttest_stats( self, season_to_monthperiod=None, start_year=None, end_year=None, convert_monthly_accumulators_to_daily=False): """ :param season_to_monthperiod: :param start_year: :param end_year: :param convert_monthly_accumulators_to_daily: if true converts monthly accumulators to daily, :return dict(season: [mean, std, nobs]) """ if True: raise NotImplementedError( "Biggus way of calculation is not implemented, use the dask version of the method" ) # select the interval of interest timesel = [ i for i, d in enumerate(self.time) if start_year <= d.year <= end_year ] data = self.data[timesel, :, :] times = [self.time[i] for i in timesel] if convert_monthly_accumulators_to_daily: ndays = np.array( [calendar.monthrange(d.year, d.month)[1] for d in times]) data = biggus.divide(data, ndays[:, np.newaxis, np.newaxis]) else: data = self.data year_month_to_index_arr = defaultdict(list) for i, t in enumerate(times): year_month_to_index_arr[t.year, t.month].append(i) # calculate monthly means monthly_data = {} for y in range(start_year, end_year + 1): for m in range(1, 13): aslice = slice(year_month_to_index_arr[y, m][0], year_month_to_index_arr[y, m][-1] + 1) monthly_data[y, m] = biggus.mean( data[aslice.start:aslice.stop, :, :], axis=0) result = {} for season, month_period in season_to_monthperiod.items(): assert isinstance(month_period, MonthPeriod) seasonal_means = [] ndays_per_season = [] for p in month_period.get_season_periods(start_year=start_year, end_year=end_year): lmos = biggus.ArrayStack([ monthly_data[start.year, start.month] for start in p.range("months") ]) ndays_per_month = np.array([ calendar.monthrange(start.year, start.month)[1] for start in p.range("months") ]) seasonal_mean = biggus.sum(biggus.multiply( lmos, ndays_per_month[:, np.newaxis, np.newaxis]), axis=0) seasonal_mean = biggus.divide(seasonal_mean, ndays_per_month.sum()) seasonal_means.append(seasonal_mean) ndays_per_season.append(ndays_per_month.sum()) seasonal_means = biggus.ArrayStack(seasonal_means) ndays_per_season = np.array(ndays_per_season) print(seasonal_means.shape, ndays_per_season.shape) assert seasonal_means.shape[0] == ndays_per_season.shape[0] clim_mean = biggus.sum(biggus.multiply( seasonal_means, ndays_per_season[:, np.newaxis, np.newaxis]), axis=0) / ndays_per_season.sum() diff = biggus.subtract(seasonal_means, clim_mean.masked_array()[np.newaxis, :, 
:]) sq_mean = biggus.sum(biggus.multiply( diff**2, ndays_per_season[:, np.newaxis, np.newaxis]), axis=0) / ndays_per_season.sum() clim_std = biggus.power(sq_mean, 0.5) clim_mean = clim_mean.masked_array() print("calculated mean") clim_std = clim_std.masked_array() print("calculated std") result[season] = [clim_mean, clim_std, ndays_per_season.shape[0]] return result def __sel_period(self, start_year, end_year, arr): timesel = [ i for i, d in enumerate(self.time) if start_year <= d.year <= end_year ] data = arr[timesel] times = [self.time[i] for i in timesel] return data, times def __get_seasmean_cache_file(self, season_to_month_period, start_year=-np.Inf, end_year=np.Inf): seas_tok = "_".join(season_to_month_period) year_tok = "{}-{}".format(start_year, end_year) return "DAYMET_seas__{}__{}.bin".format(seas_tok, year_tok) def __get_seasonal_means_with_ttest_stats_dask_lazy( self, data, season_to_monthperiod=None, start_year=-np.Inf, end_year=np.Inf, convert_monthly_accumulators_to_daily=False): # mask the resulting fields epsilon = 1.0e-5 mask = np.less_equal(np.abs(data[0, :, :] - self.missing_value), epsilon) print("data.shape = ", data.shape) data_sel, times_sel = data, self.time # select the interval of interest if convert_monthly_accumulators_to_daily: ndays = da.from_array( np.array([ calendar.monthrange(d.year, d.month)[1] for d in times_sel ]), (100, )) ndays = da.transpose(da.broadcast_to( da.from_array(ndays, ndays.shape), data_sel.shape[1:] + ndays.shape), axes=(2, 0, 1)) data_sel = data_sel / ndays year_month_to_index_arr = defaultdict(list) for i, t in enumerate(times_sel): year_month_to_index_arr[t.year, t.month].append(i) # calculate monthly means monthly_data = {} for y in range(start_year, end_year + 1): for m in range(1, 13): aslice = slice(year_month_to_index_arr[y, m][0], year_month_to_index_arr[y, m][-1] + 1) print(aslice, data_sel.shape) monthly_data[y, m] = data_sel[aslice, :, :].mean(axis=0) result = OrderedDict() for season, month_period in season_to_monthperiod.items(): assert isinstance(month_period, MonthPeriod) seasonal_means = [] ndays_per_season = [] for p in month_period.get_season_periods(start_year=start_year, end_year=end_year): lmos = da.stack([ monthly_data[start.year, start.month] for start in p.range("months") ]) ndays_per_month = np.array([ calendar.monthrange(start.year, start.month)[1] for start in p.range("months") ]) ndays_per_month = da.from_array(ndays_per_month, ndays_per_month.shape) print(p) print(lmos.shape, ndays_per_month.shape, ndays_per_month.sum()) seasonal_mean = da.tensordot( lmos, ndays_per_month, axes=([ 0, ], [ 0, ])) / ndays_per_month.sum() seasonal_means.append(seasonal_mean) ndays_per_season.append(ndays_per_month.sum()) seasonal_means = da.stack(seasonal_means) ndays_per_season = np.array(ndays_per_season) ndays_per_season = da.from_array(ndays_per_season, ndays_per_season.shape) print(seasonal_means.shape, ndays_per_season.shape) assert seasonal_means.shape[0] == ndays_per_season.shape[0] clim_mean = da.tensordot( seasonal_means, ndays_per_season, axes=([ 0, ], [ 0, ])) / ndays_per_season.sum() clim_std = ((seasonal_means - da.broadcast_to(clim_mean, seasonal_means.shape))**2 * ndays_per_season[:, np.newaxis, np.newaxis]).sum( axis=0) / ndays_per_season.sum() clim_std = clim_std**0.5 result[season] = [clim_mean, clim_std, ndays_per_season.shape[0]] return result, mask def get_seasonal_means_with_ttest_stats_dask( self, season_to_monthperiod=None, start_year=-np.Inf, end_year=np.Inf, 
convert_monthly_accumulators_to_daily=False):
        """
        :param season_to_monthperiod:
        :param start_year:
        :param end_year:
        :param convert_monthly_accumulators_to_daily: if true converts monthly accumulators to daily
        :return dict(season: [mean, std, nobs])
        """
        result, mask = self.__get_seasonal_means_with_ttest_stats_dask_lazy(
            self.data, season_to_monthperiod=season_to_monthperiod,
            start_year=start_year, end_year=end_year,
            convert_monthly_accumulators_to_daily=convert_monthly_accumulators_to_daily)

        for season in result:
            print("Computing for {}".format(season))
            # -1 because the last element is the number of observations, not a lazy array
            for i in range(len(result[season]) - 1):
                result[season][i] = np.ma.masked_where(mask, result[season][i].compute())

        return result

    def close(self):
        del self
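# The [mean, std, nobs] triples produced by get_seasonal_means_with_ttest_stats_dask
# (and by the annual-max variant above) are summary statistics for a two-sample
# t-test. A minimal sketch of that final step, assuming two result dicts `res_a`
# and `res_b` on the same grid (hypothetical names; scipy.stats.ttest_ind_from_stats
# is a real scipy function):
from scipy import stats

def seasonal_ttest(res_a, res_b, season):
    mean_a, std_a, n_a = res_a[season]
    mean_b, std_b, n_b = res_b[season]
    # Welch's t-test from summary statistics, applied elementwise to the 2D fields
    tval, pval = stats.ttest_ind_from_stats(mean_a, std_a, n_a,
                                            mean_b, std_b, n_b,
                                            equal_var=False)
    return tval, pval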
def ice_comp_model_to_sat_table(pathToModel, modelYears, modelIteration,
                                boundLat, pathToOSI, param='area', threshold=0.15, coast_exp=False):
    diff_array = np.zeros((len(modelYears), 12))

    for (nnum, yyear) in enumerate(modelYears):
        g = Dataset('./grid.cdf')
        dxc = g.variables['dxc'][0, :, :]
        dyc = g.variables['dyc'][0, :, :]
        lat = g.variables['yc'][0, :, :]
        topo = g.variables['topo'][0, :, :]
        dxcXdyc = dxc * dyc

        if coast_exp:
            topo2 = expand_coast(topo)

        area_model = np.zeros((len(modelIteration), 12))

        if modelIteration[0] == 'last':
            gg = glob.glob(pathToModel + '/' + yyear + '/' + 'it*')
            gg.sort()
            lastit = [int(gg[-1].split('/')[-1].split('t')[-1])]
        else:
            lastit = modelIteration

        for (it, iteration) in enumerate(lastit):
            fm = MFDataset(pathToModel + '/' + yyear + '/' + 'it' + str(iteration) + '/fw/*.cdf')
            for mm in range(12):
                # Note: the original tested `expand_coast == True`, which compares the
                # expand_coast *function* to True and is always False; the intended
                # flag is the coast_exp argument.
                if param == 'area':
                    if coast_exp:
                        temp_area = fm.variables['area'][mm, :, :]
                        temp_area = np.ma.masked_array(temp_area, mask=topo2.mask)
                        area_model[it, mm] = calc_area(temp_area, dxcXdyc, lat,
                                                       blat=boundLat, threshold=threshold) / 10e11
                    else:
                        area_model[it, mm] = calc_area(fm.variables['area'][mm, :, :], dxcXdyc, lat,
                                                       blat=boundLat, threshold=threshold) / 10e11
                elif param == 'extent':
                    if coast_exp:
                        temp_area = fm.variables['area'][mm, :, :]
                        temp_area = np.ma.masked_array(temp_area, mask=topo2.mask)
                        area_model[it, mm] = calc_extent(temp_area, dxcXdyc, lat,
                                                         blat=boundLat, threshold=threshold) / 10e11
                    else:
                        area_model[it, mm] = calc_extent(fm.variables['area'][mm, :, :], dxcXdyc, lat,
                                                         blat=boundLat, threshold=threshold) / 10e11
            fm.close()

        fsat = MFDataset(pathToOSI + yyear + '??.nc')
        osi_area = []
        for mm in range(12):
            area_temp = fsat.variables['ice'][mm, :, :]
            if coast_exp:
                area_temp = np.ma.masked_array(area_temp, mask=topo2.mask)
            else:
                area_temp = np.ma.masked_array(area_temp, mask=topo.mask)
            area_temp = np.ma.masked_less_equal(area_temp, threshold)
            if param == 'area':
                osi_area.append(calc_area(np.ma.filled(area_temp, 0), dxcXdyc, lat,
                                          blat=boundLat, threshold=threshold) / 10e11)
            elif param == 'extent':
                osi_area.append(calc_extent(np.ma.filled(area_temp, 0), dxcXdyc, lat,
                                            blat=boundLat, threshold=threshold) / 10e11)

        diff_array[nnum, :] = area_model[0, :] - osi_area[:]

    return diff_array
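# calc_area / calc_extent are used throughout these comparison functions but are
# not shown. Minimal sketches under the apparent conventions (concentration as a
# 0-1 fraction, per-cell areas, latitude in degrees, blat the southern bound);
# the originals may differ in detail:
import numpy as np

def calc_area(conc, cell_area, lat, blat=60.0, threshold=0.15):
    # sea-ice *area*: sum of concentration-weighted cell areas above the threshold
    sel = (lat > blat) & (conc >= threshold)
    return np.sum(conc[sel] * cell_area[sel])

def calc_extent(conc, cell_area, lat, blat=60.0, threshold=0.15):
    # sea-ice *extent*: total area of cells whose concentration exceeds the threshold
    sel = (lat > blat) & (conc >= threshold)
    return np.sum(cell_area[sel])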
dates = num2date(times[:], units=times.units, calendar=times.calendar)
print('dates corresponding to time values:\n', dates)
rootgrp.close()
# create a series of netCDF files with a variable sharing
# the same unlimited dimension.
for nfile in range(10):
    f = Dataset('mftest' + repr(nfile) + '.nc', 'w', format='NETCDF4_CLASSIC')
    f.createDimension('x', None)
    x = f.createVariable('x', 'i', ('x', ))
    x[0:10] = numpy.arange(nfile * 10, 10 * (nfile + 1))
    f.close()
# now read all those files in at once, in one Dataset.
from netCDF4 import MFDataset
f = MFDataset('mftest*nc')
print(f.variables['x'][:])
# example showing how to save numpy complex arrays using compound types.
f = Dataset('complex.nc', 'w')
size = 3  # length of 1-d complex array
# create sample complex data.
datac = numpy.exp(1j * (1. + numpy.linspace(0, numpy.pi, size)))
print(datac.dtype)
# create complex128 compound data type.
complex128 = numpy.dtype([('real', numpy.float64), ('imag', numpy.float64)])
complex128_t = f.createCompoundType(complex128, 'complex128')
# create a variable with this data type, write some data to it.
f.createDimension('x_dim', None)
v = f.createVariable('cmplx_var', complex128_t, 'x_dim')
data = numpy.empty(size, complex128)  # numpy structured array
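# The compound-type example stops here in the source; a plausible completion,
# following the example in the netCDF4 documentation:
data['real'] = datac.real
data['imag'] = datac.imag
v[:] = data  # write the structured array to the netCDF variable
f.close()
# read back and reassemble the complex values
f = Dataset('complex.nc')
datain = f.variables['cmplx_var'][:]
datac2 = numpy.empty(datain.shape, numpy.complex128)
datac2.real = datain['real']
datac2.imag = datain['imag']
print(datac2)
f.close()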
compressedFile.write(ftpf.read()) compressedFile.flush() else: if args.verbose > 0: print('using cache ' + cachedpath) compressedFile = open(cachedpath, 'r+b') compressedFile.seek(0) decompressedFile = gzip.GzipFile(fileobj=compressedFile, mode='rb') diskf = tempfile.NamedTemporaryFile('w+b') infiles.append(diskf) diskf.write(decompressedFile.read()) diskf.flush() if args.verbose > 0: print(time, 'end') ncff = MFDataset([inf.name for inf in infiles], 'r') lat = ncff.variables['latitude'][:] lon = ncff.variables['longitude'][:] points = zip(lon, lat) found_point_ids = [] for pi, point in enumerate(points): isin = prep_bounds.contains(Point(*point)) if isin: found_point_ids.append(pi) if args.verbose > 1: print(point, isin) elif args.verbose > 2: print(point, isin) varkeys = ['temperature', 'windDir', 'windSpeed', 'dewpoint', 'altimeter'] vardds = [k + 'DD' for k in varkeys]
def __init__(self, filename=None, name=None): if filename is None: raise ValueError('Need filename as argument to constructor') filestr = str(filename) if name is None: self.name = filestr else: self.name = name # Due to misspelled standard_name in # some (Akvaplan-NIVA) FVCOM files variable_aliases = { 'eastward_sea_water_velocity': 'x_sea_water_velocity', 'Northward_sea_water_velocity': 'y_sea_water_velocity', 'eastward wind': 'x_wind', 'northward wind': 'y_wind' } # Mapping FVCOM variable names to CF standard_name fvcom_mapping = { 'um': 'x_sea_water_velocity', 'vm': 'y_sea_water_velocity' } self.return_block = True try: # Open file, check that everything is ok logging.info('Opening dataset: ' + filestr) if ('*' in filestr) or ('?' in filestr) or ('[' in filestr): logging.info('Opening files with MFDataset') self.Dataset = MFDataset(filename) else: logging.info('Opening file with Dataset') self.Dataset = Dataset(filename, 'r') except Exception as e: raise ValueError(e) # We are reading and using lon/lat arrays, # and not any projected coordinates self.proj4 = '+proj=latlong' logging.debug('Finding coordinate variables.') # Find x, y and z coordinates for var_name in self.Dataset.variables: var = self.Dataset.variables[var_name] if var.ndim > 1: continue # Coordinates must be 1D-array attributes = var.ncattrs() standard_name = '' long_name = '' axis = '' units = '' CoordinateAxisType = '' if 'standard_name' in attributes: standard_name = var.__dict__['standard_name'] if 'long_name' in attributes: long_name = var.__dict__['long_name'] if 'axis' in attributes: axis = var.__dict__['axis'] if 'units' in attributes: units = var.__dict__['units'] if '_CoordinateAxisType' in attributes: CoordinateAxisType = var.__dict__['_CoordinateAxisType'] if standard_name == 'longitude' or \ long_name == 'longitude' or \ var_name == 'longitude' or \ axis == 'X' or \ CoordinateAxisType == 'Lon' or \ standard_name == 'projection_x_coordinate': self.xname = var_name # Fix for units; should ideally use udunits package if units == 'km': unitfactor = 1000 else: unitfactor = 1 x = var[:] * unitfactor self.unitfactor = unitfactor self.numx = var.shape[0] if standard_name == 'latitude' or \ long_name == 'latitude' or \ var_name == 'latitude' or \ axis == 'Y' or \ CoordinateAxisType == 'Lat' or \ standard_name == 'projection_y_coordinate': self.yname = var_name # Fix for units; should ideally use udunits package if units == 'km': unitfactor = 1000 else: unitfactor = 1 y = var[:] * unitfactor self.numy = var.shape[0] if standard_name == 'depth' or axis == 'Z': if 'positive' not in var.ncattrs() or \ var.__dict__['positive'] == 'up': self.z = var[:] else: self.z = -var[:] if standard_name == 'time' or axis == 'T' or var_name == 'time': # Read and store time coverage (of this particular file) time = var[:] time_units = units self.times = num2date(time, time_units) self.start_time = self.times[0] self.end_time = self.times[-1] if len(self.times) > 1: self.time_step = self.times[1] - self.times[0] else: self.time_step = None if 'x' not in locals(): raise ValueError('Did not find x-coordinate variable') if 'y' not in locals(): raise ValueError('Did not find y-coordinate variable') self.lon = x self.lat = y # Find all variables having standard_name self.variable_mapping = {} for var_name in self.Dataset.variables: if var_name in [self.xname, self.yname, 'depth']: continue # Skip coordinate variables var = self.Dataset.variables[var_name] attributes = var.ncattrs() if 'standard_name' in attributes: standard_name = 
str(var.__dict__['standard_name']) if standard_name in variable_aliases: # Mapping if needed standard_name = variable_aliases[standard_name] self.variable_mapping[standard_name] = str(var_name) elif var_name in fvcom_mapping: self.variable_mapping[fvcom_mapping[var_name]] = \ str(var_name) self.variables = self.variable_mapping.keys() self.xmin = self.lon.min() self.xmax = self.lon.max() self.ymin = self.lat.min() self.ymax = self.lat.max() # Run constructor of parent Reader class super(Reader, self).__init__()
def _read_tcoord(self): """ Read time coordinate information from netcdf file(s) """ nc = MFDataset(self.f) t = nc.variables[self.tcoord] self.dates = num2date(MFTime(t)[:], calendar=t.calendar, units=t.units)
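# _read_tcoord above assumes the aggregated time variable carries both `units`
# and `calendar`. A hedged standalone variant of the same MFDataset/MFTime
# pattern that tolerates a missing calendar attribute (read_dates is a
# hypothetical name):
from netCDF4 import MFDataset, MFTime, num2date

def read_dates(files, tname="time"):
    nc = MFDataset(files)
    t = nc.variables[tname]
    cal = getattr(t, "calendar", "standard")  # fall back to the CF default
    dates = num2date(MFTime(t, calendar=cal)[:], units=t.units, calendar=cal)
    nc.close()
    return dates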
keeptmax = False
if options.tx90pc or options.tx90pcd:
    keeptmax = True
keeptmin = False
if options.tn90pc or options.tn90pcd:
    keeptmin = True
keeptave = True
if options.noehf:
    keeptave = False

if options.verbose:
    print("Loading data")

# Load time data
try:
    tmaxnc = MFDataset(options.tmaxfile, "r")
except IndexError:
    tmaxnc = Dataset(options.tmaxfile, "r")
nctime = tmaxnc.variables[options.timevname]
try:
    nctime = MFTime(nctime)
except AttributeError:
    pass
except ValueError:
    pass
calendar = nctime.calendar
if not calendar:
    print("Unrecognized calendar. Using gregorian.")
    calendar = "gregorian"
elif calendar == "360_day":
    daysinyear = 360
class CRUDataManager: def __init__(self, path="/RECH/skynet1_rech3/huziy/cru_data/CRUTS3.1/cru_ts_3_10.1901.2009.tmp.dat.nc", var_name="tmp", lazy=False): self.times = None self.var_data = None self.times_var = None self.kdtree = None self.times_num = None self.lons2d, self.lats2d = None, None self.lazy = lazy self.var_name = var_name try: with Dataset(path) as ds: self._init_fields(ds) # Cannot go into with, since it needs to be open self.nc_dataset = Dataset(path) except OSError as oserr: with MFDataset(path) as ds: self._init_fields(ds) # Cannot go into with, since it needs to be open self.nc_dataset = MFDataset(path) self.nc_vars = ds.variables def close(self): self.nc_vars = None self.nc_dataset.close() del self def _init_fields(self, nc_dataset): nc_vars = nc_dataset.variables lons = nc_vars["lon"][:] lats = nc_vars["lat"][:] if lons.ndim == 1: lats2d, lons2d = np.meshgrid(lats, lons) elif lons.ndim == 2: lats2d, lons2d = lats, lons else: raise NotImplementedError("Cannot handle {}-dimensional coordinates".format(lons.ndim)) self.lons2d, self.lats2d = lons2d, lats2d self.times_var = nc_vars["time"] self.times_num = nc_vars["time"][:] if hasattr(self.times_var, "calendar"): self.times = num2date(self.times_num, self.times_var.units, self.times_var.calendar) else: self.times = num2date(self.times_num, self.times_var.units) if not self.lazy: self.var_data = nc_vars[self.var_name][:] if nc_vars[self.var_name].shape[1:] != self.lons2d.shape: print("nc_vars[self.var_name].shape = {}".format(nc_vars[self.var_name].shape)) self.var_data = np.transpose(self.var_data, axes=[0, 2, 1]) x_in, y_in, z_in = lat_lon.lon_lat_to_cartesian(self.lons2d.flatten(), self.lats2d.flatten()) self.kdtree = cKDTree(list(zip(x_in, y_in, z_in))) def get_seasonal_means_with_ttest_stats_interp_to(self, lons2d=None, lats2d=None, season_to_monthperiod=None, start_year=None, end_year=None): #TODO: implement pass def get_seasonal_means_with_ttest_stats(self, season_to_monthperiod=None, start_year=None, end_year=None): """ Note: the periods of different seasons should not overlap. 
precip are converted to mm/day before the mean and std calculations :param season_to_monthperiod: :param start_year: :param end_year: :return dict(season: [mean, std, nobs]) """ nt, nx, ny = self.var_data.shape panel = pandas.DataFrame(data=self.var_data.reshape(nt, -1), index=self.times) panel = panel[(panel.index.year >= start_year) & (panel.index.year <= end_year)] # Calculate monthly means, convert precip to mm/day if self.var_name.lower() in ["pre"]: monthly_panel = panel.groupby([panel.index.year, panel.index.month]).sum() monthly_panel = monthly_panel / monthly_panel.index.map(lambda ym: calendar.monthrange(*ym)[1])[:, np.newaxis] else: monthly_panel = panel.groupby([panel.index.year, panel.index.month]).mean() print("monthly panel:") print(monthly_panel.describe()) season_to_res = OrderedDict() for season, month_period in season_to_monthperiod.items(): assert isinstance(month_period, MonthPeriod) print("{} ------- (months: {}) ".format(season, month_period.months)) ym_to_period = month_period.get_year_month_to_period_map(start_year=start_year, end_year=end_year) # print(ym_to_period) # select data for the seasons of interest monthly_panel_tmp = monthly_panel.select(lambda ym: (ym[1] in month_period.months) and (ym in ym_to_period)) # print("monthly_panel_tmp, afterselect: {}".format(monthly_panel_tmp)) days_per_month = monthly_panel_tmp.index.map(lambda ym: calendar.monthrange(*ym)[1]) monthly_panel_tmp = monthly_panel_tmp * days_per_month[:, np.newaxis] seasonal_groups = monthly_panel_tmp.groupby(lambda ym: (ym_to_period[ym].start, ym_to_period[ym].end)) nobs = len(seasonal_groups) seasonal_means = [] days_per_season = [] for kv, gv in seasonal_groups: # print(kv, "---->", gv) # calculate seasonal mean for each year ndays = (Pendulum.instance(kv[1]).add(microseconds=1) - Pendulum.instance(kv[0])).total_days() # because the end of each period is 1 microsecond before midnight seas_mean = gv.sum(axis=0) / ndays seasonal_means.append(seas_mean.values) days_per_season.append(ndays) seasonal_means = np.array(seasonal_means) days_per_season = np.array(days_per_season) # calculate climatological mean clim_mean = (seasonal_means * days_per_season[:, np.newaxis]).sum(axis=0) / days_per_season.sum() # calculate interannual std clim_std = (((seasonal_means - clim_mean) ** 2 * days_per_season[:, np.newaxis]).sum(axis=0) / days_per_season.sum()) ** 0.5 # reshape back to the 2d field clim_mean = clim_mean.reshape(nx, ny) clim_std = clim_std.reshape(nx, ny) spatial_mask = (self.var_data[0] > 1e10) | np.isnan(self.var_data[0]) if hasattr(self.var_data, "mask"): spatial_mask = spatial_mask | self.var_data[0].mask clim_mean = np.ma.masked_where(spatial_mask, clim_mean) clim_std = np.ma.masked_where(spatial_mask, clim_std) print(season) print("clim_mean.shape={}".format(clim_mean.shape)) print("clim_std.shape={}".format(clim_std.shape)) season_to_res[season] = [clim_mean, clim_std, nobs] return season_to_res def get_seasonal_means(self, season_name_to_months=None, start_year=None, end_year=None): if season_name_to_months is None: season_name_to_months = OrderedDict([ ("Winter", (1, 2, 12)), ("Spring", list(range(3, 6))), ("Summer", list(range(6, 9))), ("Fall", list(range(9, 12)))]) season_name_to_coef = {} for sname, months in season_name_to_months.items(): season_name_to_coef[sname] = 1 if self.var_name.lower() in ["pre", "precip"]: days = sum([calendar.monthrange(y, m)[1] for m in months for y in range(start_year, end_year + 1)]) season_name_to_coef[sname] = 1.0 / float(days) month_to_season = 
collections.defaultdict(lambda: "no_season")
        for sname, mlist in season_name_to_months.items():
            for m in mlist:
                month_to_season[m] = sname

        if self.var_data is None:
            self.var_data = self.nc_dataset.variables[self.var_name][:]
            if self.var_name.lower() not in ["swe"]:
                # compare the spatial part of the shape only (the original compared the
                # full 3D shape to the 2D coordinate shape, which always differs)
                if self.var_data.shape[1:] != self.lons2d.shape:
                    self.var_data = np.transpose(self.var_data, axes=[0, 2, 1])

        nt, nx, ny = self.var_data.shape
        panel = pandas.Panel(data=self.var_data, items=self.times,
                             major_axis=list(range(nx)), minor_axis=list(range(ny)))
        panel = panel.select(lambda d: start_year <= d.year <= end_year)

        if self.var_name in ["pre", "precip"]:
            panel_seasonal = panel.groupby(lambda d: month_to_season[d.month], axis="items").sum()
        else:
            panel_seasonal = panel.groupby(lambda d: month_to_season[d.month], axis="items").mean()

        season_to_mean = OrderedDict()
        for sname, _ in season_name_to_months.items():
            season_to_mean[sname] = panel_seasonal[sname].values * season_name_to_coef[sname]
            if hasattr(self.var_data[0], "mask"):
                season_to_mean[sname] = np.ma.masked_where(self.var_data[0].mask, season_to_mean[sname])

        return season_to_mean

    def get_mean(self, start_year, end_year, months=None):
        """
        returns the mean for the period [start_year, end_year], over the months
        :type months: list
        months = list of month numbers over which the averaging is done
        """
        if months is None:
            months = list(range(1, 13))

        start_date = datetime(start_year, 1, 1)
        end_date = datetime(end_year + 1, 1, 1)

        start_date_num = date2num(start_date, self.times_var.units)
        end_date_num = date2num(end_date, self.times_var.units)

        sel_query = (self.times_num >= start_date_num) & (self.times_num < end_date_num)
        sel_dates = self.times_num[sel_query]
        sel_data = np.transpose(self.nc_vars[self.var_name][sel_query, :, :], axes=[0, 2, 1])
        sel_dates = num2date(sel_dates, self.times_var.units)

        ind_vector = np.where([(x.month in months) for x in sel_dates])[0]
        return np.mean(sel_data[ind_vector, :, :], axis=0)

    def get_daily_climatology_dataframe(self, start_year, end_year, stamp_year=2001):
        """
        returns a pandas dataframe (365, nx, ny) with daily climatological means
        """
        nt, nx, ny = self.var_data.shape
        data_panel = pandas.Panel(data=self.var_data, items=self.times,
                                  major_axis=list(range(nx)), minor_axis=list(range(ny)))
        data_panel = data_panel.select(
            lambda d: (start_year <= d.year <= end_year) and not (d.day == 29 and d.month == 2))
        data_panel = data_panel.groupby(lambda d: datetime(stamp_year, d.month, d.day), axis="items").mean()
        assert isinstance(data_panel, pandas.Panel)
        data_panel = data_panel.sort_index()
        print(data_panel.values.shape)
        return data_panel

    def get_daily_climatology(self, start_year, end_year, stamp_year=2001):
        """
        returns a numpy array of shape (365, nx, ny) with daily climatological means
        """
        # the original forwarded **locals(), which passes `self` to the bound method
        # a second time and raises a TypeError
        return self.get_daily_climatology_dataframe(start_year=start_year,
                                                    end_year=end_year,
                                                    stamp_year=stamp_year).values

    def interpolate_daily_climatology_to(self, clim_data, lons2d_target=None, lats2d_target=None):
        # expects clim_data to have the shape (365, nx, ny)
        # lons2d_target: (nx, ny)
        # lats2d_target: (nx, ny)
        x, y, z = lat_lon.lon_lat_to_cartesian(lons2d_target.flatten(), lats2d_target.flatten())

        nt = clim_data.shape[0]
        data_help = np.reshape(clim_data, (nt, -1))

        dists, inds = self.kdtree.query(list(zip(x, y, z)))
        return data_help[:, inds].reshape((nt,) + lons2d_target.shape)

    def get_thawing_index_from_climatology(self, daily_temps_clim, t0=0.0):
        nt, nx, ny = daily_temps_clim.shape
        result = np.zeros((nx, ny))
        for t in range(nt):
            tfield = daily_temps_clim[t, :, :]
            result += tfield * np.array(tfield >=
t0).astype(int) return result def create_monthly_means_file(self, start_year, end_year): fname = "{0}_monthly_means.nc".format(self.var_name) year_range = list(range(start_year, end_year + 1)) dsm = Dataset(fname, "w", format="NETCDF3_CLASSIC") dsm.createDimension('year', len(year_range)) dsm.createDimension("month", 12) dsm.createDimension('lon', self.lons2d.shape[0]) dsm.createDimension('lat', self.lons2d.shape[1]) lonVariable = dsm.createVariable('longitude', 'f4', ('lon', 'lat')) latVariable = dsm.createVariable('latitude', 'f4', ('lon', 'lat')) yearVariable = dsm.createVariable("year", "i4", ("year",)) variable = dsm.createVariable(self.var_name, "f4", ('year', "month", 'lon', 'lat')) for i, the_year in enumerate(year_range): print(the_year) for j, the_month in enumerate(range(1, 13)): variable[i, j, :, :] = self.get_mean(the_year, the_year, months=[the_month]) lonVariable[:] = self.lons2d latVariable[:] = self.lats2d yearVariable[:] = np.array(year_range) dsm.close() pass def _interp_and_sum(self, data1d, mults_1d, x, y, z, nneighbors=1): data_interp = self.interpolate_data_to_cartesian(data1d, x, y, z, nneighbours=nneighbors) return np.sum(mults_1d * data_interp) def get_monthly_timeseries_using_mask(self, mask, lons2d_target, lats2d_target, multipliers_2d, start_date=None, end_date=None): """ multipliers_2d used to multiply the values when aggregating into a single timeseries sum(mi * vi) - in space """ bool_vect = np.array([start_date <= t <= end_date for t in self.times]) new_times = list(filter(lambda t: start_date <= t <= end_date, self.times)) new_vals = self.var_data[bool_vect, :, :] x_out, y_out, z_out = lat_lon.lon_lat_to_cartesian(lons2d_target.flatten(), lats2d_target.flatten()) print(len(new_times)) flat_mask = mask.flatten() x_out = x_out[flat_mask == 1] y_out = y_out[flat_mask == 1] z_out = z_out[flat_mask == 1] mults = multipliers_2d.flatten()[flat_mask == 1] data_interp = [self._interp_and_sum(new_vals[t, :, :].flatten(), mults, x_out, y_out, z_out) for t in range(len(new_times))] print("Interpolated data", data_interp) print("Interpolated all") return TimeSeries(time=new_times, data=data_interp).get_ts_of_monthly_means() def get_mean_upstream_timeseries_monthly(self, model_point, data_manager): """ get mean swe upstream of the model_point year range for selection is in model_point.continuous_data_years() .. 
""" assert isinstance(model_point, ModelPoint) assert isinstance(data_manager, Crcm5ModelDataManager) # create the mask of points over which the averaging is going to be done lons_targ = data_manager.lons2D[model_point.flow_in_mask == 1] lats_targ = data_manager.lats2D[model_point.flow_in_mask == 1] xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_targ, lats_targ) nxs, nys = self.lons2d.shape i_source, j_source = list(range(nxs)), list(range(nys)) j_source, i_source = np.meshgrid(j_source, i_source) i_source = i_source.flatten() j_source = j_source.flatten() dists, inds = self.kdtree.query(list(zip(xt, yt, zt)), k=1) ixsel = i_source[inds] jysel = j_source[inds] print("Calculating spatial mean") #calculate spatial mean #calculate spatial mean if self.lazy: theVar = self.nc_vars[self.var_name] data_series = [] for i, j in zip(ixsel, jysel): data_series.append(theVar[:, j, i]) data_series = np.mean(data_series, axis=0) else: data_series = np.mean(self.var_data[:, ixsel, jysel], axis=1) print("Finished calculating spatial mean") #calculate daily climatology df = pandas.DataFrame(data=data_series, index=self.times, columns=["values"]) df["year"] = df.index.map(lambda d: d.year) df = df[df["year"].isin(model_point.continuous_data_years)] monthly_clim = df.groupby(by=lambda d: d.month).mean() month_dates = [datetime(1985, m, 15) for m in range(1, 13)] vals = [monthly_clim.ix[d.month, "values"] for d in month_dates] return pandas.TimeSeries(data=vals, index=month_dates) def get_mean_upstream_timeseries_daily(self, model_point, dm, stamp_dates=None): """ get mean swe upstream of the model_point """ assert isinstance(model_point, ModelPoint) assert isinstance(dm, Crcm5ModelDataManager) # create the mask of points over which the averaging is going to be done lons_targ = dm.lons2D[model_point.flow_in_mask == 1] lats_targ = dm.lats2D[model_point.flow_in_mask == 1] xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_targ, lats_targ) nxs, nys = self.lons2d.shape i_source, j_source = list(range(nxs)), list(range(nys)) j_source, i_source = np.meshgrid(j_source, i_source) i_source = i_source.flatten() j_source = j_source.flatten() dists, inds = self.kdtree.query(list(zip(xt, yt, zt)), k=1) ixsel = i_source[inds] jysel = j_source[inds] df_empty = pandas.DataFrame(index=self.times) df_empty["year"] = df_empty.index.map(lambda d: d.year) # calculate spatial mean sel_date_indices = np.where(df_empty["year"].isin(model_point.continuous_data_years))[0] if self.lazy: the_var = self.nc_vars[self.var_name] data_series = np.mean([the_var[sel_date_indices, j, i] for i, j in zip(ixsel, jysel)], axis=0) else: data_series = np.mean(self.var_data[:, ixsel, jysel], axis=1) # calculate daily climatology df = pandas.DataFrame(data=data_series, index=self.times, columns=["values"]) df["year"] = df.index.map(lambda d: d.year) df = df[df["year"].isin(model_point.continuous_data_years)] daily_clim = df.groupby(by=lambda d: (d.month, d.day)).mean() vals = [daily_clim.ix[(d.month, d.day), "values"] for d in stamp_dates] return pandas.TimeSeries(data=vals, index=stamp_dates) def get_daily_timeseries_using_mask(self, mask, lons2d_target, lats2d_target, multipliers_2d, start_date=None, end_date=None): """ multipliers_2d used to multiply the values when aggregating into a single timeseries sum(mi * vi) - in space """ bool_vect = np.array([start_date <= t <= end_date for t in self.times]) new_times = list(filter(lambda t: start_date <= t <= end_date, self.times)) new_vals = self.var_data[bool_vect, :, :] x_out, y_out, z_out = 
lat_lon.lon_lat_to_cartesian(lons2d_target.flatten(), lats2d_target.flatten()) print(len(new_times)) flat_mask = mask.flatten() x_out = x_out[flat_mask == 1] y_out = y_out[flat_mask == 1] z_out = z_out[flat_mask == 1] mults = multipliers_2d.flatten()[flat_mask == 1] data_interp = [self._interp_and_sum(new_vals[t, :, :].flatten(), flat_mask, x_out, y_out, z_out) for t in range(len(new_times))] print("Interpolated all") return TimeSeries(time=new_times, data=data_interp).get_ts_of_daily_means() def interpolate_data_to_cartesian(self, data_in_flat, x, y, z, nneighbours=4): """ len(data_in_flat) , len(x) == len(y) == len(z) == len(data_out_flat) - all 1D """ print("start query") dst, ind = self.kdtree.query(list(zip(x, y, z)), k=nneighbours) print("end query") inverse_square = 1.0 / dst ** 2 if len(dst.shape) > 1: norm = np.sum(inverse_square, axis=1) norm = np.array([norm] * dst.shape[1]).transpose() coefs = inverse_square / norm data_out_flat = np.sum(coefs * data_in_flat[ind], axis=1) elif len(dst.shape) == 1: data_out_flat = data_in_flat[ind] else: raise Exception("Could not find neighbor points") return data_out_flat def interpolate_data_to(self, data_in, lons2d, lats2d, nneighbours=4): """ Interpolates data_in to the grid defined by (lons2d, lats2d) assuming that the data_in field is on the initial CRU grid interpolate using 4 nearest neighbors and inverse of squared distance """ x_out, y_out, z_out = lat_lon.lon_lat_to_cartesian(lons2d.flatten(), lats2d.flatten()) dst, ind = self.kdtree.query(list(zip(x_out, y_out, z_out)), k=nneighbours) data_in_flat = data_in.flatten() inverse_square = 1.0 / dst ** 2 if len(dst.shape) > 1: norm = np.sum(inverse_square, axis=1) norm = np.array([norm] * dst.shape[1]).transpose() coefs = inverse_square / norm data_out_flat = np.sum(coefs * data_in_flat[ind], axis=1) elif len(dst.shape) == 1: data_out_flat = data_in_flat[ind] else: raise Exception("Could not find neighbor points") return np.reshape(data_out_flat, lons2d.shape)
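# Both interpolate_* methods above follow one pattern: k-nearest-neighbour lookup
# in Cartesian space, then inverse-squared-distance weighting. A self-contained
# sketch of that pattern (idw is a hypothetical helper; cKDTree is scipy's):
import numpy as np
from scipy.spatial import cKDTree

def idw(src_xyz, src_vals, dst_xyz, k=4):
    # src_xyz: (n, 3) source points, src_vals: (n,), dst_xyz: (m, 3); use k >= 2
    tree = cKDTree(src_xyz)
    dist, ind = tree.query(dst_xyz, k=k)
    dist = np.maximum(dist, 1.0e-12)  # avoid division by zero on exact matches
    w = 1.0 / dist ** 2
    return (w * src_vals[ind]).sum(axis=1) / w.sum(axis=1)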
def runTest(self):
    """testing multi-file dataset access"""
    f = MFDataset(self.files, check=True)
    f.set_auto_maskandscale(True)  # issue570
    assert f.history == 'created today'
    assert_array_equal(np.arange(0, nx), f.variables['x'][:])
    varin = f.variables['data']
    datin = varin[:]
    assert_array_equal(datin.mask, data.mask)
    varin.set_auto_maskandscale(False)
    data2 = data.filled()
    assert varin.long_name == 'phony data'
    assert len(varin) == nx
    assert varin.shape == (nx, ydim, zdim)
    assert varin.dimensions == ('x', 'y', 'z')
    assert_array_equal(varin[4:-4:4, 3:5, 2:8], data2[4:-4:4, 3:5, 2:8])
    assert varin[0, 0, 0] == data2[0, 0, 0]
    assert_array_equal(varin[:], data2)
    assert getattr(varin, 'nonexistantatt', None) == None
    f.close()
    # test master_file kwarg (issue #835).
    f = MFDataset(self.files, master_file=self.files[-1], check=True)
    assert_array_equal(np.arange(0, nx), f.variables['x'][:])
    varin = f.variables['data']
    assert_array_equal(varin[4:-4:4, 3:5, 2:8], data2[4:-4:4, 3:5, 2:8])
    f.close()
    # testing multi-file get_variables_by_attributes
    f = MFDataset(self.files, check=True)
    assert f.get_variables_by_attributes(axis='T') == []
    # the original dropped the assert here, so the comparison result was discarded
    assert f.get_variables_by_attributes(units='zlotys')[0] == f['x']
    f.close()
def readEnsemble(wrfinit, timerange=None, fields=None, debug=False): ''' Reads in desired fields and returns 2-D arrays of data for each field (barb/contour/field) ''' if debug: print fields datadict = {} file_list, missing_list = makeEnsembleList( wrfinit, timerange) #construct list of files # loop through fill field, contour field, barb field and retrieve required data for f in ['fill', 'contour', 'barb']: if not fields[f].keys(): continue if debug: print 'Reading field:', fields[f]['name'], 'from', fields[f][ 'filename'] # save some variables for use in this function filename = fields[f]['filename'] arrays = fields[f]['arrayname'] fieldtype = fields[f]['ensprod'] fieldname = fields[f]['name'] if fieldtype in ['prob', 'neprob']: thresh = fields[f]['thresh'] if fieldtype[0:3] == 'mem': member = int(fieldtype[3:]) # open Multi-file netcdf dataset if debug: print file_list[filename] fh = MFDataset(file_list[filename]) # loop through each field, wind fields will have two fields that need to be read datalist = [] for n, array in enumerate(arrays): if debug: print 'Reading', array #read in 3D array (times*members,ny,nx) from file object if 'arraylevel' in fields[f]: if isinstance(fields[f]['arraylevel'], list): level = fields[f]['arraylevel'][n] else: level = fields[f]['arraylevel'] else: level = None if level == 'max': data = np.amax(fh.variables[array][:, :, :, :], axis=1) elif level is None: data = fh.variables[array][:, :, :] else: data = fh.variables[array][:, level, :, :] # change units for certain fields if array in [ 'U_PL', 'V_PL', 'UBSHR6', 'VBSHR6', 'U10', 'V10', 'U_COMP_STM', 'V_COMP_STM', 'S_PL' ]: data = data * 1.93 # m/s > kt elif array in ['DEWPOINT_2M', 'T2', 'AFWA_WCHILL', 'AFWA_HEATIDX']: data = (data - 273.15) * 1.8 + 32.0 # K > F elif array in [ 'PREC_ACC_NC', 'PREC_ACC_C', 'AFWA_PWAT', 'PWAT', 'AFWA_SNOWFALL', 'AFWA_SNOW', 'AFWA_ICE', 'AFWA_FZRA' ]: data = data * 0.0393701 # mm > in #hcl elif array in ['AFWA_PWAT', 'PWAT', 'AFWA_SNOWFALL', 'AFWA_SNOW', 'AFWA_ICE', 'AFWA_FZRA']: data = data*0.0393701 # mm > in elif array in [ 'RAINNC', 'RAINC', 'GRPL_MAX', 'SNOW_ACC_NC', 'AFWA_HAIL' ]: data = data * 0.0393701 # mm > in elif array in ['T_PL', 'TD_PL', 'SFC_LI']: data = data - 273.15 # K > C elif array in ['AFWA_MSLP', 'MSLP']: data = data * 0.01 # Pa > hPa elif array in ['ECHOTOP']: data = data * 3.28084 # m > ft elif array in ['AFWA_VIS']: data = (data * 0.001) / 1.61 # m > mi elif array in ['SBCINH', 'MLCINH', 'W_DN_MAX']: data = data * -1.0 # make cin positive elif array in ['PVORT_320K']: data = data * 1000000 # multiply by 1e6 elif array in ['SBT123_GDS3_NTAT', 'SBT124_GDS3_NTAT']: data = data - 273.15 # K -> C elif array in ['HAIL_MAXK1', 'HAIL_MAX2D']: data = data * 39.3701 # m -> inches elif array in ['PBMIN', 'PBMIN_SFC']: data = data * 0.01 # Pa -> hPa # elif array in ['LTG1_MAX1', 'LTG2_MAX', 'LTG3_MAX']: data = data*0.20 # scale down excess values datalist.append(data) # these are derived fields, we don't have in any of the input files but we can compute if 'name' in fields[f]: if fieldname in ['shr06mag', 'shr01mag', 'bunkmag', 'speed10m']: datalist = [np.sqrt(datalist[0]**2 + datalist[1]**2)] elif fieldname == 'stp': datalist = [computestp(datalist)] # GSR in fields are T(K), mixing ratio (kg/kg), and surface pressure (Pa) elif fieldname == 'thetae': datalist = [compute_thetae(datalist)] elif fieldname == 'pbmin': datalist = [datalist[1] - datalist[0][:, 0, :]] #elif fieldname in ['precip', 'precipacc']: datalist = [ datalist[0]+datalist[1] ] datadict[f] = [] 
for data in datalist: # perform mean/max/variance/etc to reduce 3D array to 2D if (fieldtype == 'mean'): data = np.mean(data, axis=0) elif (fieldtype == 'pmm'): data = compute_pmm(data) elif (fieldtype == 'max'): data = np.amax(data, axis=0) elif (fieldtype == 'var'): data = np.std(data, axis=0) elif (fieldtype == 'summean'): for i in missing_list[filename]: data = np.insert(data, i, np.nan, axis=0) #insert nan for missing files #hcl data = np.reshape(data, (data.shape[0]/10,10,data.shape[1],data.shape[2])) data = np.nansum(data, axis=0) #hcl data = np.nanmean(data, axis=0) elif (fieldtype == 'summax'): for i in missing_list[filename]: data = np.insert(data, i, np.nan, axis=0) #insert nan for missing files data = np.reshape( data, (data.shape[0] / 10, 10, data.shape[1], data.shape[2])) data = np.nansum(data, axis=0) data = np.nanmax(data, axis=0) elif (fieldtype[0:3] == 'mem'): for i in missing_list[filename]: data = np.insert(data, i, np.nan, axis=0) #insert nan for missing files data = np.reshape( data, (data.shape[0] / 10, 10, data.shape[1], data.shape[2])) data = np.nanmax(data, axis=0) data = data[member - 1, :] elif (fieldtype in ['prob', 'neprob']): data = (data >= thresh).astype('float') for i in missing_list[filename]: data = np.insert(data, i, np.nan, axis=0) #insert nan for missing files data = np.reshape( data, (data.shape[0] / 10, 10, data.shape[1], data.shape[2])) data = np.nanmax(data, axis=0) if (fieldtype == 'neprob'): data = compute_neprob(data, roi=14, sigma=float(fields['sigma']), type='gaussian') else: data = np.nanmean(data, axis=0) data = data + 0.001 #hack to ensure that plot displays discrete prob values if debug: print 'field', fieldname, 'has shape', data.shape, 'max', data.max( ), 'min', data.min() # attach data arrays for each type of field (e.g. { 'fill':[data], 'barb':[data,data] }) datadict[f].append(data) fh.close() return (datadict, missing_list)
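# compute_neprob is called above but not defined here. A plausible sketch of a
# neighborhood ensemble probability (the threshold exceedance is already applied
# upstream): spatial maximum within a radius of influence per member, mean over
# members, then Gaussian smoothing. The original scheme may differ in detail.
import numpy as np
from scipy import ndimage

def compute_neprob(binary, roi=14, sigma=40.0, type='gaussian'):
    # binary: (nmember, ny, nx) array of 0/1 threshold exceedances
    y, x = np.ogrid[-roi:roi + 1, -roi:roi + 1]
    footprint = (x ** 2 + y ** 2) <= roi ** 2  # circular neighborhood
    hits = np.array([ndimage.maximum_filter(m, footprint=footprint) for m in binary])
    prob = np.nanmean(hits, axis=0)
    if type == 'gaussian':
        prob = ndimage.gaussian_filter(prob, sigma=sigma)
    return prob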
print('\n +++ READING THE DATA +++')

pcpaccaux = np.zeros((30, 64, 128))
pcpaccaux[:] = np.nan

for i, ano in enumerate(range(1982, 2012)):
    # print(ano)
    nc1 = 'nc/pcp-daily-echam46-amip-{0}01.nc'.format(ano)
    nc2 = 'nc/pcp-daily-echam46-amip-{0}02.nc'.format(ano)
    nc3 = 'nc/pcp-daily-echam46-amip-{0}03.nc'.format(ano)
    netcdfs = [nc1, nc2, nc3]
    data = MFDataset(netcdfs)
    pcp = data.variables['pcp'][:]
    lons_360 = data.variables['longitude'][:]
    lats = data.variables['latitude'][:]
    data.close()
    pcpaccaux[i, :, :] = np.nansum(pcp, axis=0)

print(pcpaccaux.shape)

pcpacc, lons = shiftgrid(180., pcpaccaux, lons_360, start=False)

print('\n +++ INTERPOLATION +++')
newlats = np.linspace(-90, 90, 181)
for fhr in range(0, 49):
    print('forecast hour', fhr)
    if fhr in fhdone:
        fha.write(str(fhr) + "\n")
        continue
    RUN_DIR = '/glade/scratch/hclin/CONUS/wrfda/postdir/soundings'
    files = []
    sound = '%s/%s/sound_wrfda_Fhr_%03d.nc' % (RUN_DIR, yyyymmddhh, fhr)
    print(sound)
    if os.path.exists(sound):
        files.append(sound)
    if len(files) < 1:
        continue

    print(time.ctime(time.time()), ':', 'Reading data')
    numens = len(files)
    fh = MFDataset(files)
    numstations = len(fh.dimensions['stations'])
    numlevels = len(fh.dimensions['bottom_top'])
    tmpc = fh.variables['TEMP_MODLEV'][:].reshape((numens, numlevels, numstations))
    dwpc = fh.variables['DEWPOINT_MODLEV'][:].reshape((numens, numlevels, numstations))
    hght = fh.variables['HEIGHT_MODLEV'][:].reshape((numens, numlevels, numstations))
    pres = fh.variables['PRESSURE_MODLEV'][:].reshape((numens, numlevels, numstations))
    ugrd = fh.variables['U_GRID_MODLEV'][:].reshape((numens, numlevels, numstations))
    vgrd = fh.variables['V_GRID_MODLEV'][:].reshape((numens, numlevels, numstations))
    stns = chartostring(fh.variables['stn'][:, 0:3])
def smart_reader(fNcdf, var_list, suppress_warning=False):
    """
    Smarter alternative to using var = fNcdf.variables['var'][:] when handling
    PROCESSED files: it also checks the matching XXXXX.atmos_average.nc (or
    daily...) and XXXXX.fixed.nc files.

    Args:
        fNcdf: netCDF file object (i.e. already opened with Dataset or MFDataset)
        var_list: variable or list of variables, e.g. 'areo' or ['pk','bk','areo']
        suppress_warning: suppress the debug statement; useful if the variable is
                          not expected to be found in the file anyway
    Returns:
        out_list: variable contents, as a singleton or as values to unpack
    -------
    Example:

    from netCDF4 import Dataset
    fNcdf = Dataset('/u/akling/FV3/00668.atmos_average_pstd.nc', 'r')

    ucomp = fNcdf.variables['ucomp'][:]   # << this is the regular way
    vcomp = smart_reader(fNcdf, 'vcomp')  # << this is exactly equivalent
    pk, bk, areo = smart_reader(fNcdf, ['pk', 'bk', 'areo'])
    # this will get 'areo' from 00668.atmos_average.nc if it is not available in
    # the original _pstd.nc file; if pk and bk are absent from
    # 00668.atmos_average.nc, it will also check 00668.fixed.nc

    *** NOTE ***
    Only the variables' content is returned, not the attributes
    """
    # This out_list holds the variable contents
    out_list = []
    one_element = False
    file_is_MF = False

    Ncdf_path = get_Ncdf_path(fNcdf)  # returns a string (Dataset) or a list (MFDataset)
    if type(Ncdf_path) == list:
        file_is_MF = True

    # For generality, convert to a list if only one variable is provided,
    # e.g. 'areo' > ['areo']
    if type(var_list) == str:
        one_element = True
        var_list = [var_list]

    for ivar in var_list:
        # First try to read from the original file
        if ivar in fNcdf.variables.keys():
            out_list.append(fNcdf.variables[ivar][:])
        else:
            full_path_try = alt_FV3path(Ncdf_path, alt='raw', test_exist=True)
            if file_is_MF:
                f_tmp = MFDataset(full_path_try, 'r')
            else:
                f_tmp = Dataset(full_path_try, 'r')

            if ivar in f_tmp.variables.keys():
                out_list.append(f_tmp.variables[ivar][:])
                if not suppress_warning:
                    print('**Warning*** Using variable %s in %s instead of original file(s)' % (ivar, full_path_try))
                f_tmp.close()
            else:
                f_tmp.close()
                full_path_try = alt_FV3path(Ncdf_path, alt='fixed', test_exist=True)
                if file_is_MF:
                    full_path_try = full_path_try[0]

                f_tmp = Dataset(full_path_try, 'r')
                if ivar in f_tmp.variables.keys():
                    out_list.append(f_tmp.variables[ivar][:])
                    f_tmp.close()
                    if not suppress_warning:
                        print('**Warning*** Using variable %s in %s instead of original file(s)' % (ivar, full_path_try))
                else:
                    print('***ERROR*** Variable %s not found in %s, NOR in raw output or fixed file' % (ivar, full_path_try))
                    print('   >>> Assigning %s to NaN' % (ivar))
                    f_tmp.close()
                    out_list.append(np.NaN)

    if one_element:
        out_list = out_list[0]
    return out_list
def merge_files_from_list(self, Ncfilename_list): Mf_IN = MFDataset(Ncfilename_list, 'r') self.copy_all_dims_from_Ncfile(Mf_IN) self.copy_all_vars_from_Ncfile(Mf_IN) Mf_IN.close()
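# copy_all_dims_from_Ncfile / copy_all_vars_from_Ncfile belong to the same (not
# shown) Ncdf wrapper class. A minimal standalone sketch of what such a merge
# amounts to with plain netCDF4 (merge_to_single_file is a hypothetical function;
# attributes are not copied, and every variable is read fully into memory):
from netCDF4 import Dataset, MFDataset

def merge_to_single_file(in_files, out_file):
    src = MFDataset(in_files, 'r')
    dst = Dataset(out_file, 'w')
    for dname, dim in src.dimensions.items():
        dst.createDimension(dname, None if dim.isunlimited() else len(dim))
    for vname, var in src.variables.items():
        vout = dst.createVariable(vname, var.dtype, var.dimensions)
        vout[:] = var[:]
    src.close()
    dst.close()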
@author: deborahkhider Opening a dataset contained in multiple netCDF files """ from netCDF4 import MFDataset # Just get a list of netCDF files. root = "/Volumes/Data HD/Documents/MINT/Climate/netCDFTutorial" files = ["Oct2010.nc", "Nov2010.nc", "Dec2010.nc"] file_names = [] for name in files: file_names.append(root + "/" + name) #Open the file and get the keys for this example nc_fid = MFDataset(file_names) keys = [] nc_vars = [var for var in nc_fid.variables] for vars in nc_vars: keys.append(getattr(nc_fid.variables[vars], 'long_name')) # First let's print out the file def MFncdump(nc_fid): """ MFncdump prints dimensions, variables and their attribute info Args: nc_fid: a netCDF file """
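    # The source snippet ends after the docstring; a plausible body in the same
    # spirit (print the dimensions, then each variable with its attributes):
    print("Dimensions:")
    for dim_name, dim in nc_fid.dimensions.items():
        print("  {}: {}".format(dim_name, len(dim)))
    print("Variables:")
    for var_name, var in nc_fid.variables.items():
        print("  {} {}".format(var_name, var.dimensions))
        for att in var.ncattrs():
            print("    {}: {}".format(att, getattr(var, att)))

# MFncdump(nc_fid)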
import os, time from netCDF4 import MFDataset, Dataset #,num2date,date2num import numpy as np import matplotlib.pyplot as plt import warnings test = False if test == True: files = [ "X:/ARCTIC2030/a20_avg_11705_arctic2030.nc", "X:/ARCTIC2030/a20_avg_11733_arctic2030.nc", "X:/ARCTIC2030/a20_avg_11761_arctic2030.nc" ] f = MFDataset(files) else: f = MFDataset("X:/ARCTIC2030/*.nc") latitude = np.array(f.variables['lat_rho']) longitude = np.array(f.variables['lon_rho']) # coordinates of needed station st_lon = 126.82 st_lat = 76.47 # real lat 76.77 # function 'def find_xi_eta' is based on # Model2roms Python toolbox # https://github.com/trondkr/model2roms
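# find_xi_eta itself is not included in this snippet. A hedged nearest-gridpoint
# lookup in the same spirit (the model2roms original is more elaborate):
def find_xi_eta(st_lon, st_lat, lon2d, lat2d):
    # squared "degree distance", with longitude differences shrunk by cos(lat)
    d = (np.cos(np.radians(lat2d)) * (lon2d - st_lon)) ** 2 + (lat2d - st_lat) ** 2
    eta, xi = np.unravel_index(np.argmin(d), d.shape)
    return xi, eta

# xi, eta = find_xi_eta(st_lon, st_lat, longitude, latitude)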
def ncread(file, vars=None, dims=False, noisy=False, atts=False, datetimes=False): """ Read in the FVCOM results file and spit out numpy arrays for each of the variables specified in the vars list. Optionally specify a dict with keys whose names match the dimension names in the netCDF file and whose values are strings specifying alternative ranges or lists of indices. For example, to extract the first hundred time steps, supply dims as: dims = {'time':'0:100'} To extract the first, 400th and 10,000th values of any array with nodes: dims = {'node':'[0, 3999, 9999]'} Any dimension not given in dims will be extracted in full. Specify atts=True to extract the variable attributes. Set datetimes=True to convert the FVCOM Modified Julian Day values to python datetime objects. Parameters ---------- file : str, list If a string, the full path to an FVCOM netCDF output file. If a list, a series of files to be loaded. Data will be concatenated into a single dict. vars : list, optional List of variable names to be extracted. If omitted, all variables are returned. dims : dict, optional Dict whose keys are dimensions and whose values are a string of either a range (e.g. {'time':'0:100'}) or a list of individual indices (e.g. {'time':'[0, 1, 80, 100]'}). Slicing is supported (::5 for every fifth value). noisy : bool, optional Set to True to enable verbose output. atts : bool, optional Set to True to enable output of the attributes (defaults to False). datetimes : bool, optional Set to True to convert FVCOM Modified Julian Days to Python datetime objects (creates a new `datetime' key in the output dict. Only applies if `vars' includes either the `Times' or `time' variables. Note: if FVCOM has been run with single precision output, then the conversion of the `time' values to a datetime object suffers rounding errors. It's best to either run FVCOM in double precision or specify only the `Times' data in the `vars' list. Returns ------- FVCOM : dict Dict of data extracted from the netCDF file. Keys are those given in vars and the data are stored as ndarrays. If `datetimes' is True, then this also includes a `datetime' key in which is the FVCOM Modified Julian Day time series converted to Python datetime objects. attributes : dict, optional If atts=True, returns the attributes as a dict for each variable in vars. The key `dims' contains the array dimensions (each variable contains the names of its dimensions) as well as the shape of the dimensions defined in the netCDF file. The key `global' contains the global attributes. See Also -------- read_probes : read in FVCOM ASCII probes output files. """ # Set to True when we've converted from Modified Julian Day so we don't # end up doing the conversion twice, once for `Times' and again for # `time' if both variables have been requested in `vars'. done_datetimes = False # Check whether we'll be able to fulfill the datetime request. if datetimes and vars and not list(set(vars) & set(('Times', 'time'))): raise ValueError("Conversion from Modified Julian Day to python " "datetimes has been requested but no time variable " "(`Times' or `time') has been requested in vars.") # If we have a list, assume it's lots of files and load them all. if isinstance(file, list): try: try: rootgrp = MFDataset(file, 'r') except IOError as msg: raise IOError('Unable to open file {} ({}). Aborting.'.format(file, msg)) except: # Try aggregating along a 'time' dimension (for POLCOMS, # for example). 
try: rootgrp = MFDataset(file, 'r', aggdim='time') except IOError as msg: raise IOError('Unable to open file {} ({}). Aborting.'.format(file, msg)) else: rootgrp = Dataset(file, 'r') # Create a dict of the dimension names and their current sizes read_dims = {} for key, var in list(rootgrp.dimensions.items()): # Make the dimensions ranges so we can use them to extract all the # values. read_dims[key] = '0:' + str(len(var)) # Compare the dimensions in the netCDF file with those provided. If we've # been given a dict of dimensions which differs from those in the netCDF # file, then use those. if dims: commonKeys = set(read_dims).intersection(list(dims.keys())) for k in commonKeys: read_dims[k] = dims[k] if noisy: print("File format: {}".format(rootgrp.file_format)) if not vars: vars = iter(list(rootgrp.variables.keys())) FVCOM = {} # Save the dimensions in the attributes dict. if atts: attributes = {} attributes['dims'] = read_dims attributes['global'] = {} for g in rootgrp.ncattrs(): attributes['global'][g] = getattr(rootgrp, g) for key, var in list(rootgrp.variables.items()): if noisy: print('Found ' + key, end=' ') sys.stdout.flush() if key in vars: vDims = rootgrp.variables[key].dimensions toExtract = [read_dims[d] for d in vDims] # If we have no dimensions, we must have only a single value, in # which case set the dimensions to empty and append the function to # extract the value. if not toExtract: toExtract = '.getValue()' # Thought I'd finally figured out how to replace the eval approach, # but I still can't get past the indexing needed to be able to # subset the data. # FVCOM[key] = rootgrp.variables.get(key)[0:-1] # I know, I know, eval() is evil. getData = 'rootgrp.variables[\'{}\']{}'.format(key, str(toExtract).replace('\'', '')) FVCOM[key] = eval(getData) # Add the units and dimensions for this variable to the list of # attributes. if atts: attributes[key] = {} try: attributes[key]['units'] = rootgrp.variables[key].units except: pass try: attributes[key]['dims'] = rootgrp.variables[key].dimensions except: pass if datetimes and key in ('Times', 'time') and not done_datetimes: # Convert the time data to datetime objects. How we do this # depends on which we hit first - `Times' or `time'. For the # former, we need to parse the strings, for the latter we can # leverage num2date from the netCDF4 module and use the time # units attribute. if key == 'Times': try: FVCOM['datetime'] = [datetime.strptime(''.join(i), '%Y-%m-%dT%H:%M:%S.%f') for i in FVCOM[key]] except ValueError: # Try a different format before bailing out. FVCOM['datetime'] = [datetime.strptime(''.join(i), '%Y/%m/%d %H:%M:%S.%f') for i in FVCOM[key]] done_datetimes = True elif key == 'time': FVCOM['datetime'] = num2date(FVCOM[key], rootgrp.variables[key].units) done_datetimes = True if noisy: if len(str(toExtract)) < 60: print('(extracted {})'.format(str(toExtract).replace('\'', ''))) else: print('(extracted given indices)') elif noisy: print() # Close the open file. rootgrp.close() if atts: return FVCOM, attributes else: return FVCOM
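# Example use of ncread (the file name is hypothetical): pull two variables for
# the first hundred time steps, with attributes and datetime conversion:
#
# fvcom, atts = ncread('casename_0001.nc',
#                      vars=['zeta', 'Times'],
#                      dims={'time': '0:100'},
#                      atts=True, datetimes=True)
# zeta = fvcom['zeta']        # (time, node) ndarray
# when = fvcom['datetime']    # Modified Julian Day converted to datetime objects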
def melt(ncfiles, vars=None, global_atts=None, var_atts=None, coord_vars=None, missing=None):
    """ Build a (molten) Pandas DataFrame from a series of netcdf files.

    This is a flexible, but very memory-inefficient data structure, so be
    careful calling this with large netcdf files.

    Arguments:
        ncfiles     -- the input filenames
        vars        -- the variables to read; if None, all variables in the files are read
        var_atts    -- variable attributes to include in each line of output, default all
        global_atts -- global attributes to include in each row of output
        coord_vars  -- variables to treat as coordinates; if None, variables
                       with the same name as dimensions are used"""

    logger = loghelper.get_logger(LOGGER)

    if len(ncfiles) == 1:
        dataset = Dataset(ncfiles[0])
    else:
        dataset = MFDataset(ncfiles)

    coord_vars = get_coordinate_vars(dataset, coord_vars)
    variables = dataset.variables

    # Get global attributes in dataset.
    # Shouldn't really use this, but it works.
    dataset_atts = dataset.__dict__
    use_global_atts = _lookup(global_atts, dataset_atts, missing)

    # If no vars specified, use all in ncfiles.
    if vars is None or vars == ["all"]:
        vars = list(variables.keys())

    # Variables are a function of var(reftime, leadtime, height, location)
    # or var(reftime, leadtime, location).
    usevars = [v for v in vars if v not in coord_vars]
    logger.debug("usevars: %s" % usevars)

    # There must be a clean way of doing this in a general way, but I don't
    # have the time to code this properly, so I'm looping over fixed and
    # hard-coded dimension names.
    location = coord_vars['location']
    reftime = coord_vars['reftime']
    leadtime = coord_vars['leadtime']
    height = coord_vars['height']
    #lat = coord_vars['lat']
    #lon = coord_vars['lon']

    nloc = len(location)
    nreftime = len(reftime)
    nleadtime = len(leadtime)

    # Dimension order is reftime, leadtime, location, height
    # or reftime, leadtime, location.
    vars2D = [v for v in usevars if len(variables[v].shape) == 3]
    vars3D = [v for v in usevars if len(variables[v].shape) == 4]

    series = []
    for v in vars2D:
        vname = v
        variable = variables[v]
        use_var_atts = _lookup(var_atts, variable.__dict__, missing)
        factors = ([reftime, leadtime, [HGT2DNUM], location, [vname]]
                   + list(map(_listify, use_global_atts.values()))
                   + list(map(_listify, use_var_atts.values())))
        names = (['reftime', 'leadtime', 'height', 'location', 'variable']
                 + list(use_global_atts.keys()) + list(use_var_atts.keys()))
        index = pd.MultiIndex.from_product(factors, names=names)
        #index = pd.MultiIndex.from_tuples([(ref,lead,loc,HGT2DNUM,vname) for ref in reftime for lead in leadtime for loc in location], names=['reftime', 'leadtime', 'location', 'height','variable'])

        data = variable[:]
        if isinstance(data, np.ma.MaskedArray):
            data = data.flatten().filled(np.nan).astype(float)
        else:
            data = data.flatten().astype(float)
        series.append(pd.Series(data=data, index=index, name='value'))

    for v in vars3D:
        variable = variables[v]
        vname = v
        use_var_atts = _lookup(var_atts, variable.__dict__, missing)
        for h, hgt in enumerate(height):
            subvar = variable[:, :, :, h]
            factors = ([reftime, leadtime, [hgt], location, [vname]]
                       + list(map(_listify, use_global_atts.values()))
                       + list(map(_listify, use_var_atts.values())))
            names = (['reftime', 'leadtime', 'height', 'location', 'variable']
                     + list(use_global_atts.keys()) + list(use_var_atts.keys()))
            index = pd.MultiIndex.from_product(factors, names=names)
            #index = pd.MultiIndex.from_tuples([(ref,lead,loc,hgt,vname) for ref in reftime for lead in leadtime for loc in location], names=['reftime', 'leadtime', 'location','height', 'variable'])

            if isinstance(subvar, np.ma.MaskedArray):
                data = subvar.flatten().filled(np.nan).astype(float)
            else:
                data = subvar.flatten().astype(float)
            series.append(pd.Series(data=data, index=index, name='value'))

    # This is molten data, to use Hadley Wickham's terminology,
    # or perhaps 5th normal form?
    result = pd.concat(series, axis=0).reset_index()
    return result
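# Hypothetical usage sketch for melt: the file names, variable names and the
# 'DOMAIN' global attribute are illustrative assumptions. The result is
# long-format data, one row per (reftime, leadtime, height, location,
# variable) combination.
df = melt(['forecast_d01.nc', 'forecast_d02.nc'],
          vars=['SPEED', 'DIRECTION'],
          global_atts=['DOMAIN'],
          var_atts=['units'])
print(df.head())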
def get_tile_dimension(in_files, var_name, transfer_limit_Mbytes=None, time_range=None):
    '''
    Computes the total size of a 3D variable array and returns the optimal
    tile dimension for spatial chunking.

    :param in_files: absolute path(s) to NetCDF dataset(s) (including OPeNDAP URLs)
    :type in_files: list

    :param var_name: variable name to process
    :type var_name: str

    :param transfer_limit_Mbytes: maximum OPeNDAP/THREDDS transfer limit in Mbytes (default: None)
    :type transfer_limit_Mbytes: float

    :param time_range: time range
    :type time_range: list of 2 datetime objects: [dt1, dt2]

    :rtype: int

    .. warning:: only for 3D variables
    '''

    if transfer_limit_Mbytes is None:
        return 0

    transfer_limit_bytes = transfer_limit_Mbytes * 1024 * 1024  # Mbytes --> bytes

    in_files.sort()
    mfnc = MFDataset(in_files, 'r', aggdim='time')

    ndim = mfnc.variables[var_name].ndim
    if ndim != 3:
        raise ValueError("The variable to process must be 3D")

    v = mfnc.variables[var_name]
    v_shape = v.shape
    v_dtype = v.dtype
    v_nb_bytes = v_dtype.itemsize

    if time_range is None:
        total_array_size_bytes = v_shape[0] * v_shape[1] * v_shape[2] * v_nb_bytes
        optimal_tile_dimension = int(numpy.sqrt(transfer_limit_bytes / (v_shape[0] * v_nb_bytes)))
    else:
        var_time = mfnc.variables['time']
        try:
            time_calend = var_time.calendar
        except AttributeError:
            time_calend = 'gregorian'
        time_units = var_time.units
        time_arr = var_time[:]
        dt_arr = numpy.array([util_dt.num2date(dt, calend=time_calend, units=time_units) for dt in time_arr])
        indices_subset = util_dt.get_indices_subset(dt_arr, time_range)
        nb_time_steps_after_subset = len(indices_subset)
        total_array_size_bytes = nb_time_steps_after_subset * v_shape[1] * v_shape[2] * v_nb_bytes
        optimal_tile_dimension = int(numpy.sqrt(transfer_limit_bytes / (nb_time_steps_after_subset * v_nb_bytes)))

    mfnc.close()
    return optimal_tile_dimension
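# Worked example of the tile arithmetic above, with hypothetical file names:
# for a float32 variable (4 bytes per value) spanning 3650 daily time steps
# and a 500 MB transfer limit, the tile side is
# int(sqrt(500*1024*1024 / (3650*4))) = 189, so each 189 x 189 spatial tile
# (over the full time axis) stays under the transfer limit.
tile = get_tile_dimension(['tas_1991.nc', 'tas_2000.nc'], 'tas',
                          transfer_limit_Mbytes=500)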
def __init__(self, filename=None, name=None, gridfile=None):

    if filename is None:
        raise ValueError('Need filename as argument to constructor')

    filestr = str(filename)
    if name is None:
        self.name = filestr
    else:
        self.name = name

    # Due to misspelled standard_name in some (Akvaplan-NIVA) FVCOM files.
    variable_aliases = {
        'eastward_sea_water_velocity': 'x_sea_water_velocity',
        'Northward_sea_water_velocity': 'y_sea_water_velocity',
        'eastward wind': 'x_wind',
        'northward wind': 'y_wind'
    }

    # Mapping FVCOM variable names to CF standard_name.
    fvcom_mapping = {
        'um': 'x_sea_water_velocity',
        'vm': 'y_sea_water_velocity'
    }

    self.return_block = True

    try:
        # Open file, check that everything is ok.
        logging.info('Opening dataset: ' + filestr)
        if ('*' in filestr) or ('?' in filestr) or ('[' in filestr):
            logging.info('Opening files with MFDataset')
            self.Dataset = MFDataset(filename)
        else:
            logging.info('Opening file with Dataset')
            self.Dataset = Dataset(filename, 'r')
    except Exception as e:
        raise ValueError(e)

    # We are reading and using lon/lat arrays, and not any projected
    # coordinates.
    self.proj4 = '+proj=latlong'

    logging.debug('Finding coordinate variables.')
    # Find x, y and z coordinates. First check if we have specified a
    # separate grid file.
    if gridfile is None:
        self.gridfile = self.Dataset
    else:
        self.gridfile = Dataset(gridfile)
        logging.info('Opening Grid file')

    # Now check content of grid- or datafile.
    for var_name in self.gridfile.variables:
        var = self.gridfile.variables[var_name]
        if var.ndim > 1:
            continue  # Coordinates must be 1D-array
        attributes = var.ncattrs()
        standard_name = ''
        long_name = ''
        axis = ''
        units = ''
        grid = ''
        CoordinateAxisType = ''
        if 'standard_name' in attributes:
            standard_name = var.__dict__['standard_name']
        if 'long_name' in attributes:
            long_name = var.__dict__['long_name']
        if 'axis' in attributes:
            axis = var.__dict__['axis']
        if 'grid' in attributes:
            grid = var.__dict__['grid']
        if 'units' in attributes:
            units = var.__dict__['units']
        if '_CoordinateAxisType' in attributes:
            CoordinateAxisType = var.__dict__['_CoordinateAxisType']

        # Read FVCOM Elements/Center grid (for u and v).
        if (standard_name == 'longitude' and grid == 'Elems') or \
                var_name == 'lonc':
            self.xname = var_name
            self.numx = var.shape[0]
            x = var[:]
        if (standard_name == 'latitude' and grid == 'Elems') or \
                var_name == 'latc':
            self.yname = var_name
            self.numy = var.shape[0]
            y = var[:]
        if var_name == 'siglayz_center' and grid == 'Elems':
            if 'positive' not in var.ncattrs() or \
                    var.__dict__['positive'] == 'up':
                self.z = var[:]
            else:
                self.z = -var[:]
        # todo: read FVCOM Vertices grid (for tracers)

    self.lon = x
    self.lat = y

    # Find all variables having standard_name.
    self.variable_mapping = {}
    for var_name in self.Dataset.variables:
        if var_name in [self.xname, self.yname, 'depth']:
            continue  # Skip coordinate variables
        var = self.Dataset.variables[var_name]
        attributes = var.ncattrs()
        standard_name = ''
        long_name = ''
        axis = ''
        units = ''
        CoordinateAxisType = ''
        if 'standard_name' in attributes:
            standard_name = var.__dict__['standard_name']
        if 'long_name' in attributes:
            long_name = var.__dict__['long_name']
        if 'axis' in attributes:
            axis = var.__dict__['axis']
        if 'grid' in attributes:
            grid = var.__dict__['grid']
        if 'units' in attributes:
            units = var.__dict__['units']

        if standard_name == 'time' or axis == 'T' or var_name == 'time':
            # Read and store time coverage (of this particular file).
            time = var[:]
            time_units = units
            self.times = num2date(time, time_units)
            self.start_time = self.times[0]
            self.end_time = self.times[-1]
            if len(self.times) > 1:
                self.time_step = self.times[1] - self.times[0]
            else:
                self.time_step = None

        if 'standard_name' in attributes:
            standard_name = str(var.__dict__['standard_name'])
            if standard_name in variable_aliases:  # Mapping if needed
                standard_name = variable_aliases[standard_name]
            self.variable_mapping[standard_name] = str(var_name)
        elif var_name in fvcom_mapping:
            self.variable_mapping[fvcom_mapping[var_name]] = \
                str(var_name)

    self.variables = list(self.variable_mapping.keys())

    self.xmin = self.lon.min()
    self.xmax = self.lon.max()
    self.ymin = self.lat.min()
    self.ymax = self.lat.max()

    # Run constructor of parent Reader class.
    super(Reader, self).__init__()
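# Hypothetical usage sketch for the FVCOM reader above (the class name
# 'Reader' follows from the super() call; the file names are illustrative).
# A glob pattern triggers MFDataset, and the element grid can come from a
# separate grid file.
r = Reader(filename='akvaplan_fvcom_00*.nc', gridfile='fvcom_grid.nc')
print(r.start_time, r.end_time, list(r.variables))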
cp_ocean = 3992.10322329649  # seawater heat capacity [J kg-1 K-1]

# Read 'descriptor' and 'years' from external file.
with open("files.txt") as f:
    for line in f.readlines():
        exec(line.lstrip())
model_label = "%s (%s)" % (descriptor, years)

# TMPDIR where input files are located.
tmpdir = "./"

# Open input files.
#fstatic = Dataset(tmpdir+'19000101.ocean_geometry.nc', 'r')
fstatic = Dataset(tmpdir+'ocean_annual.static.nc', 'r')
ftemp = MFDataset(tmpdir+'ocean_annual.*.temp.nc')
fsalt = MFDataset(tmpdir+'ocean_annual.*.salt.nc')

# Time info.
time = ftemp.variables["time"]
ntimes = len(time[:])
date = num2date(time[:], time.units, time.calendar.lower())
year = [d.year for d in date]
time_days = date2num(date, 'days since 01-01-0001', time.calendar.lower())

# Grid info.
#area = fstatic.variables["Ah"][:]
area = fstatic.variables["area_t"][:]
z = ftemp.variables["zl"][:]
nz = len(z)
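# Hedged sketch of how these inputs typically combine: an area-weighted mean
# of the top model layer. The variable name 'temp' and its
# (time, zl, y, x) dimension order are assumptions about the ocean_annual
# files, not confirmed by the script above.
temp0 = ftemp.variables["temp"][0, :]        # first year, all depths
sst0 = temp0[0, :, :]                        # top model layer
mean_sst = (sst0 * area).sum() / area.sum()  # area-weighted mean
print("Area-weighted mean surface temperature, year %d: %.2f" % (year[0], mean_sst))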
def get_data(exp, area, out):
    if out == '':
        s = Dataset(exp + '/ice_month.nc')
    else:
        s = MFDataset(exp + '/ice_month*.nc')
    print('Reading file', exp)
    tm = len(s.variables['time'][:])
    SW = np.zeros(tm)
    LW = np.zeros(tm)
    SH = np.zeros(tm)
    LH = np.zeros(tm)
    HS = np.zeros(tm)     # snow thick
    SV = np.zeros(tm)     # snow vol
    SF = np.zeros(tm)     # snow fall
    HI = np.zeros(tm)     # ice thick
    IV = np.zeros(tm)     # ice vol
    FRA = np.zeros(tm)
    BHEAT = np.zeros(tm)
    BMELT = np.zeros(tm)
    SST = np.zeros(tm)
    SSS = np.zeros(tm)
    ALB = np.zeros(tm)
    SALTF = np.zeros(tm)
    time = np.zeros(tm)

    for t in range(tm):
        time[t] = s.variables['time'][t] / 365.
        print('Time (years):', time[t])
        hi_tmp = s.variables['HI'][t, :]
        sst_tmp = s.variables['SST'][t, :]
        sss_tmp = s.variables['SSS'][t, :]
        sw_tmp = s.variables['SW'][t, :]
        lw_tmp = s.variables['LW'][t, :]
        sh_tmp = s.variables['SH'][t, :]
        lh_tmp = s.variables['LH'][t, :]
        hs_tmp = s.variables['HS'][t, :]
        fra_tmp = s.variables['FRAZIL'][t, :]
        bh_tmp = s.variables['BHEAT'][t, :]
        bm_tmp = s.variables['BMELT'][t, :]
        saltf_tmp = s.variables['SALTF'][t, :]
        sf_tmp = s.variables['SNOWFL'][t, :]
        alb_tmp = s.variables['ALB'][t, :]

        HI[t] = (area * hi_tmp).sum() / area.sum()
        SSS[t] = (area * sss_tmp).sum() / area.sum()
        SST[t] = (area * sst_tmp).sum() / area.sum()
        BHEAT[t] = (area * bh_tmp).sum()
        BMELT[t] = (area * bm_tmp).sum()
        HS[t] = (area * hs_tmp).sum() / area.sum()
        SW[t] = (area * sw_tmp).sum()
        LW[t] = (area * lw_tmp).sum()
        LH[t] = (area * lh_tmp).sum()
        SH[t] = (area * sh_tmp).sum()
        FRA[t] = (area * fra_tmp).sum()
        ALB[t] = (area * alb_tmp).sum() / area.sum()
        IV[t] = (area * hi_tmp).sum()
        SV[t] = (area * hs_tmp).sum()
        SALTF[t] = (area * saltf_tmp).sum()
        print('SSS, SST, HI, ALB, FRAZIL: ', SSS[t], SST[t], HI[t], ALB[t], FRA[t])

    s.close()
    return SSS, SST, HI, BHEAT, BMELT, IV, HS, SV, SH, LH, SW, LW, FRA, ALB, SALTF, time
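# Hypothetical usage sketch for get_data: the experiment directory and the
# static file supplying cell areas are illustrative assumptions. Any
# non-empty 'out' string selects the multi-file (MFDataset) branch.
grid = Dataset('ocean_static.nc')
cell_area = grid.variables['CELL_AREA'][:]
results = get_data('CTRL_run', cell_area, out='monthly')
SSS, SST, HI = results[0], results[1], results[2]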
def readFVCOM(file, varList=None, clipDims=False, noisy=False, atts=False):
    """
    Read in the FVCOM results file and spit out numpy arrays for each of the
    variables specified in the varList list.

    Optionally specify a dict with keys whose names match the dimension names
    in the NetCDF file and whose values are strings specifying alternative
    ranges or lists of indices. For example, to extract the first hundred time
    steps, supply clipDims as:

        clipDims = {'time':'0:100'}

    To extract the first, 400th and 10,000th values of any array with nodes:

        clipDims = {'node':'[0, 3999, 9999]'}

    Any dimension not given in clipDims will be extracted in full.

    Specify atts=True to extract the variable attributes.

    Parameters
    ----------
    file : str, list
        If a string, the full path to an FVCOM NetCDF output file. If a list,
        a series of files to be loaded. Data will be concatenated into a
        single dict.
    varList : list, optional
        List of variable names to be extracted. If omitted, all variables are
        returned.
    clipDims : dict, optional
        Dict whose keys are dimensions and whose values are a string of either
        a range (e.g. {'time':'0:100'}) or a list of individual indices (e.g.
        {'time':'[0, 1, 80, 100]'}). Slicing is supported (::5 for every
        fifth value) but it is not possible to extract data from the end of
        the array with a negative index (e.g. 0:-4).
    noisy : bool, optional
        Set to True to enable verbose output.
    atts : bool, optional
        Set to True to enable output of the attributes (defaults to False).

    Returns
    -------
    FVCOM : dict
        Dict of data extracted from the NetCDF file. Keys are those given in
        varList and the data are stored as ndarrays.
    attributes : dict, optional
        If atts=True, returns the attributes as a dict for each variable in
        varList. The key 'dims' contains the array dimensions (each variable
        contains the names of its dimensions) as well as the shape of the
        dimensions defined in the NetCDF file. The key 'global' contains the
        global attributes.

    See Also
    --------
    readProbes : read in FVCOM ASCII probes output files.

    """

    # If we have a list, assume it's lots of files and load them all.
    if isinstance(file, list):
        try:
            try:
                rootgrp = MFDataset(file, 'r')
            except IOError as msg:
                raise IOError('Unable to open file {} ({}). Aborting.'.format(file, msg))
        except Exception:
            # Try aggregating along a 'time' dimension (for POLCOMS, for
            # example).
            try:
                rootgrp = MFDataset(file, 'r', aggdim='time')
            except IOError as msg:
                raise IOError('Unable to open file {} ({}). Aborting.'.format(file, msg))
    else:
        rootgrp = Dataset(file, 'r')

    # Create a dict of the dimension names and their current sizes.
    dims = {}
    for key, var in list(rootgrp.dimensions.items()):
        # Make the dimensions ranges so we can use them to extract all the
        # values.
        dims[key] = '0:' + str(len(var))

    # Compare the dimensions in the NetCDF file with those provided. If we've
    # been given a dict of dimensions which differs from those in the NetCDF
    # file, then use those.
    if clipDims:
        commonKeys = set(dims).intersection(list(clipDims.keys()))
        for k in commonKeys:
            dims[k] = clipDims[k]

    if noisy:
        print("File format: {}".format(rootgrp.file_format))

    if not varList:
        varList = list(rootgrp.variables.keys())

    FVCOM = {}

    # Save the dimensions in the attributes dict.
    if atts:
        attributes = {}
        attributes['dims'] = dims
        attributes['global'] = {}
        for g in rootgrp.ncattrs():
            attributes['global'][g] = getattr(rootgrp, g)

    for key, var in list(rootgrp.variables.items()):
        if noisy:
            print('Found ' + key, end=' ')
            sys.stdout.flush()

        if key in varList:
            vDims = rootgrp.variables[key].dimensions

            toExtract = [dims[d] for d in vDims]

            # If we have no dimensions, we must have only a single value, in
            # which case set the dimensions to empty and append the function
            # to extract the value.
            if not toExtract:
                toExtract = '.getValue()'

            # Thought I'd finally figured out how to replace the eval
            # approach, but I still can't get past the indexing needed to be
            # able to subset the data.
            # FVCOM[key] = rootgrp.variables.get(key)[0:-1]
            # I know, I know, eval() is evil.
            getData = 'rootgrp.variables[\'{}\']{}'.format(key, str(toExtract).replace('\'', ''))
            FVCOM[key] = eval(getData)

            # Add the units and dimensions for this variable to the list of
            # attributes.
            if atts:
                attributes[key] = {}
                try:
                    attributes[key]['units'] = rootgrp.variables[key].units
                except AttributeError:
                    pass
                try:
                    attributes[key]['dims'] = rootgrp.variables[key].dimensions
                except AttributeError:
                    pass

            if noisy:
                if len(str(toExtract)) < 60:
                    print('(extracted {})'.format(str(toExtract).replace('\'', '')))
                else:
                    print('(extracted given indices)')

        elif noisy:
            print()

    # Close the open file.
    rootgrp.close()

    if atts:
        return FVCOM, attributes
    else:
        return FVCOM
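# Hypothetical usage sketch for readFVCOM (file names illustrative): passing
# a list of files aggregates them with MFDataset, and the '::10' slice keeps
# every tenth time step.
FVCOM = readFVCOM(['casename_0001.nc', 'casename_0002.nc'],
                  varList=['u', 'v'],
                  clipDims={'time': '::10'},
                  noisy=True)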
def __init__(self, filename=None, name=None, proj4=None):

    if filename is None:
        raise ValueError('Need filename as argument to constructor')

    filestr = str(filename)
    if name is None:
        self.name = filestr
    else:
        self.name = name

    try:
        # Open file, check that everything is ok.
        logging.info('Opening dataset: ' + filestr)
        if ('*' in filestr) or ('?' in filestr) or ('[' in filestr):
            logging.info('Opening files with MFDataset')
            if has_xarray:
                self.Dataset = xr.open_mfdataset(filename)
            else:
                self.Dataset = MFDataset(filename)
        else:
            logging.info('Opening file with Dataset')
            if has_xarray:
                self.Dataset = xr.open_dataset(filename)
            else:
                self.Dataset = Dataset(filename, 'r')
    except Exception as e:
        raise ValueError(e)

    logging.debug('Finding coordinate variables.')
    if proj4 is not None:  # If user has provided a projection apriori
        self.proj4 = proj4

    # Find x, y and z coordinates.
    for var_name in self.Dataset.variables:
        logging.debug('Parsing variable: ' + var_name)
        var = self.Dataset.variables[var_name]
        #if var.ndim > 1:
        #    continue  # Coordinates must be 1D-array
        if has_xarray:
            attributes = var.attrs
            att_dict = var.attrs
        else:
            attributes = var.ncattrs()
            att_dict = var.__dict__
        standard_name = ''
        long_name = ''
        axis = ''
        units = ''
        CoordinateAxisType = ''
        if not hasattr(self, 'proj4'):
            for att in attributes:
                if 'proj4' in att:
                    if has_xarray:
                        self.proj4 = str(att_dict[att])
                    else:
                        self.proj4 = str(var.__getattr__(att))
                else:
                    if 'grid_mapping_name' in att:
                        mapping_dict = att_dict
                        logging.debug(('Parsing CF grid mapping dictionary: '
                                       + str(mapping_dict)))
                        try:
                            self.proj4, proj = \
                                proj_from_CF_dict(mapping_dict)
                        except Exception:
                            logging.info('Could not parse CF grid_mapping')
        if 'standard_name' in attributes:
            standard_name = att_dict['standard_name']
        if 'long_name' in attributes:
            long_name = att_dict['long_name']
        if 'axis' in attributes:
            axis = att_dict['axis']
        if 'units' in attributes:
            units = att_dict['units']
        if '_CoordinateAxisType' in attributes:
            CoordinateAxisType = att_dict['_CoordinateAxisType']

        # has_xarray checks in each case below to avoid loading data if it
        # isn't a coord. Is there a better way??
        if standard_name == 'longitude' or \
                CoordinateAxisType == 'Lon' or \
                long_name.lower() == 'longitude':
            if has_xarray:
                var_data = var.values
            else:
                var_data = var[:]
            self.lon = var_data
            lon_var_name = var_name
        if standard_name == 'latitude' or \
                CoordinateAxisType == 'Lat' or \
                long_name.lower() == 'latitude':
            if has_xarray:
                var_data = var.values
            else:
                var_data = var[:]
            self.lat = var_data
            lat_var_name = var_name
        if axis == 'X' or \
                standard_name == 'projection_x_coordinate':
            self.xname = var_name
            # Fix for units; should ideally use udunits package.
            if units == 'km':
                unitfactor = 1000
            elif units == '100 km':
                unitfactor = 100000
            else:
                unitfactor = 1
            if has_xarray:
                var_data = var.values
            else:
                var_data = var[:]
            x = var_data * unitfactor
            self.numx = var_data.shape[0]
        if axis == 'Y' or \
                standard_name == 'projection_y_coordinate':
            self.yname = var_name
            # Fix for units; should ideally use udunits package.
            if units == 'km':
                unitfactor = 1000
            elif units == '100 km':
                unitfactor = 100000
            else:
                unitfactor = 1
            self.unitfactor = unitfactor
            if has_xarray:
                var_data = var.values
            else:
                var_data = var[:]
            y = var_data * unitfactor
            self.numy = var_data.shape[0]
        if standard_name == 'depth' or axis == 'Z':
            if has_xarray:
                var_data = var.values
            else:
                var_data = var[:]
            if var_data.ndim == 1:
                if 'positive' not in attributes or \
                        att_dict['positive'] == 'up':
                    self.z = var_data
                else:
                    self.z = -var_data
        if standard_name == 'time' or axis == 'T' or var_name in ['time', 'vtime']:
            # Read and store time coverage (of this particular file).
            if has_xarray:
                var_data = var.values
            else:
                var_data = var[:]
            time = var_data
            time_units = units
            if has_xarray:
                self.times = [
                    datetime.utcfromtimestamp(
                        (OT - np.datetime64('1970-01-01T00:00:00Z'))
                        / np.timedelta64(1, 's')) for OT in time]
            else:
                self.times = num2date(time, time_units)
            self.start_time = self.times[0]
            self.end_time = self.times[-1]
            if len(self.times) > 1:
                self.time_step = self.times[1] - self.times[0]
            else:
                self.time_step = None
        if standard_name == 'realization':
            if has_xarray:
                var_data = var.values
            else:
                var_data = var[:]
            self.realizations = var_data
            logging.debug('%i ensemble members available'
                          % len(self.realizations))

    if 'x' not in locals():
        if self.lon.ndim == 1:
            x = self.lon[:]
            self.xname = lon_var_name
            self.numx = len(x)
        else:
            raise ValueError('Did not find x-coordinate variable')
    if 'y' not in locals():
        if self.lat.ndim == 1:
            y = self.lat[:]
            self.yname = lat_var_name
            self.numy = len(y)
        else:
            raise ValueError('Did not find y-coordinate variable')
    if not hasattr(self, 'unitfactor'):
        self.unitfactor = 1

    if 'x' in locals() and 'y' in locals():
        self.xmin, self.xmax = x.min(), x.max()
        self.ymin, self.ymax = y.min(), y.max()
        self.delta_x = np.abs(x[1] - x[0])
        self.delta_y = np.abs(y[1] - y[0])
        rel_delta_x = (x[1::] - x[0:-1])
        rel_delta_x = np.abs((rel_delta_x.max() - rel_delta_x.min()) / self.delta_x)
        rel_delta_y = (y[1::] - y[0:-1])
        rel_delta_y = np.abs((rel_delta_y.max() - rel_delta_y.min()) / self.delta_y)
        if rel_delta_x > 0.05:  # Allow 5 % deviation
            print(rel_delta_x)
            print(x[1::] - x[0:-1])
            raise ValueError('delta_x is not constant!')
        if rel_delta_y > 0.05:
            print(rel_delta_y)
            print(y[1::] - y[0:-1])
            raise ValueError('delta_y is not constant!')
        self.x = x  # Store coordinate vectors
        self.y = y
    else:
        if hasattr(self, 'lon') and hasattr(self, 'lat'):
            logging.info('No projection found, using lon/lat arrays')
            self.xname = lon_var_name
            self.yname = lat_var_name
        else:
            raise ValueError('Neither x/y-coordinates nor lon/lat arrays found')

    if not hasattr(self, 'proj4'):
        if self.lon.ndim == 1:
            logging.debug('Lon and lat are 1D arrays, assuming latlong projection')
            self.proj4 = '+proj=latlong'
        elif self.lon.ndim == 2:
            logging.debug('Reading lon lat 2D arrays, since projection is not given')
            self.lon = self.lon[:]
            self.lat = self.lat[:]
            self.projected = False

    if hasattr(self, 'proj4') and 'latlong' in self.proj4 and \
            hasattr(self, 'xmax') and self.xmax > 360:
        logging.info('Longitudes > 360 degrees, subtracting 360')
        self.xmin -= 360
        self.xmax -= 360
        self.x -= 360

    # Find all variables having standard_name.
    self.variable_mapping = {}
    for var_name in self.Dataset.variables:
        if var_name in [self.xname, self.yname, 'depth']:
            continue  # Skip coordinate variables
        var = self.Dataset.variables[var_name]
        if has_xarray:
            attributes = var.attrs
            att_dict = var.attrs
        else:
            attributes = var.ncattrs()
            att_dict = var.__dict__
        if 'standard_name' in attributes:
            standard_name = str(att_dict['standard_name'])
            if standard_name in self.variable_aliases:  # Mapping if needed
                standard_name = self.variable_aliases[standard_name]
            self.variable_mapping[standard_name] = str(var_name)

    self.variables = list(self.variable_mapping.keys())

    # Run constructor of parent Reader class.
    super(Reader, self).__init__()
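# Hypothetical usage sketch for the generic reader above (the class name
# 'Reader' follows from the super() call; the file pattern and proj4 string
# are illustrative). Supplying proj4 up front skips the CF grid-mapping
# search.
r = Reader(filename='arctic_20km_*.nc',
           proj4='+proj=stere +lat_0=90 +lon_0=0 +lat_ts=60 +units=m')
print(r.proj4, r.delta_x, len(r.times))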
def __init__(self, filename=None, name=None):

    if filename is None:
        raise ValueError('Filename is missing')
    filestr = str(filename)
    if name is None:
        self.name = filestr
    else:
        self.name = name

    # xarray currently does not handle this type of grid:
    # https://github.com/pydata/xarray/issues/2233

    self.timer_start("open dataset")
    logger.info('Opening dataset: ' + filestr)
    if ('*' in filestr) or ('?' in filestr) or ('[' in filestr):
        logger.info('Opening files with MFDataset')
        self.dataset = MFDataset(filename)
    else:
        logger.info('Opening file with Dataset')
        self.dataset = Dataset(filename, 'r')

    self.proj4 = '+proj=lonlat'

    logger.info('Reading grid and coordinate variables..')
    self.x, self.y = self.dataset['longitude'][:], self.dataset['latitude'][:]

    # Expects time units of the form 'seconds since YYYY-MM-DD HH:MM:SS';
    # the slice extracts the 19-character reference date.
    ref_time = datetime.fromisoformat(self.dataset['time'].units[14:33])
    self.times = np.array([
        ref_time + timedelta(seconds=d.item())
        for d in self.dataset['time'][:]])
    self.start_time = self.times[0]
    self.end_time = self.times[-1]
    # time steps are not constant

    self.xmin = np.min(self.x)
    self.xmax = np.max(self.x)
    self.ymin = np.min(self.y)
    self.ymax = np.max(self.y)

    # Levels are the depth of the bottom of each layer. Re-assign to the
    # middle of each layer for nearest interpolation.
    self.z = -self.dataset['level'][:]
    self.z = np.insert(self.z, 0, [0.])
    self.z = self.z[:-1] + (np.diff(self.z) / 2)
    assert len(self.z) == len(self.dataset['level'][:])
    self.zmin, self.zmax = np.min(self.z), 0.
    assert (self.z <= 0).all()

    self.variable_mapping = {}
    for var_name in self.dataset.variables:
        # Skipping coordinate variables.
        if var_name in ['time', 'longitude', 'latitude', 'level']:
            continue
        var = self.dataset[var_name]
        if 'standard_name' in var.ncattrs():
            std_name = var.getncattr('standard_name')
            std_name = self.variable_aliases.get(std_name, std_name)
            self.variable_mapping[std_name] = str(var_name)

    self.variables = list(self.variable_mapping.keys())

    # Run constructor of parent Reader class.
    super().__init__()

    self.boundary = self._build_boundary_polygon_(self.x.compressed(),
                                                  self.y.compressed())

    self.timer_start("build index")
    logger.debug("building index of nodes..")
    self.nodes_idx = self._build_ckdtree_(self.x, self.y)
    self.timer_end("build index")

    self.timer_end("open dataset")
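# Self-contained illustration of the layer-midpoint re-assignment above:
# layer bottoms at 2, 5 and 10 m become midpoints at 1, 3.5 and 7.5 m depth.
import numpy as np
level = np.array([2., 5., 10.])
z = -level
z = np.insert(z, 0, [0.])
z = z[:-1] + (np.diff(z) / 2)
print(z)  # [-1.  -3.5 -7.5]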
def getMFNcVar(nc_files, keys):
    '''
    Extract variables from a dataset spread across multiple netCDF files.

    This function gets the variables contained in the netCDF files and
    returns them as nested Python dictionaries. The outer dictionary's keys
    are the long names, while each inner dictionary contains the values,
    standard name (CF), units and the missing data flag.

    Args:
        nc_files (str): A name (path or glob pattern) of the netCDF files
        keys (list): A list of keys to fetch the variables according to the
            CF standard

    Returns:
        dict_out (dict): A dictionary containing the long names as keys and
            the associated data as values.
    '''
    # Import the package.
    from netCDF4 import MFDataset

    # Open the netCDF files.
    nc_fid = MFDataset(nc_files)

    # Get the variable names.
    nc_vars = [var for var in nc_fid.variables]

    # Make empty lists to collect the info.
    # Long name (should be using the CF conventions).
    nc_vars_longname = []
    # Units.
    nc_vars_units = []
    # Standard name.
    nc_vars_standardname = []
    # Corrections (collected for reference; see note below on auto-scaling).
    nc_vars_scale_factor = []
    nc_vars_add_offset = []
    # Missing values.
    nc_vars_missing_value = []

    for var in nc_vars:
        if 'long_name' in nc_fid.variables[var].ncattrs():
            nc_vars_longname.append(getattr(nc_fid.variables[var], 'long_name'))
        else:
            nc_vars_longname.append(var)
        if 'units' in nc_fid.variables[var].ncattrs():
            nc_vars_units.append(getattr(nc_fid.variables[var], 'units'))
        else:
            nc_vars_units.append('NA')
        if 'standard_name' in nc_fid.variables[var].ncattrs():
            nc_vars_standardname.append(getattr(nc_fid.variables[var], 'standard_name'))
        else:
            nc_vars_standardname.append('NA')
        if 'scale_factor' in nc_fid.variables[var].ncattrs():
            nc_vars_scale_factor.append(getattr(nc_fid.variables[var], 'scale_factor'))
        else:
            nc_vars_scale_factor.append(1)
        if 'add_offset' in nc_fid.variables[var].ncattrs():
            nc_vars_add_offset.append(getattr(nc_fid.variables[var], 'add_offset'))
        else:
            nc_vars_add_offset.append(0)
        if 'missing_value' in nc_fid.variables[var].ncattrs():
            nc_vars_missing_value.append(getattr(nc_fid.variables[var], 'missing_value'))
        else:
            nc_vars_missing_value.append('NA')

    # Check the list against the desired variables and build the output.
    dict_out = {}
    for name in nc_vars_longname:
        if name in keys:
            f = {'values': [], 'units': [], 'missing_value': [],
                 'standard_name': {}}
            idx = nc_vars_longname.index(name)
            # netCDF4 applies scale_factor and add_offset automatically on
            # read (auto-scaling is on by default), so the values are used
            # as returned rather than corrected a second time.
            f['values'] = nc_fid.variables[nc_vars[idx]][:]
            f['units'] = nc_vars_units[idx]
            f['missing_value'] = nc_vars_missing_value[idx]
            f['standard_name'] = nc_vars_standardname[idx]
            dict_out[name] = f

    return dict_out
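# Hypothetical usage sketch for getMFNcVar: the glob pattern and the
# '2 metre temperature' long_name are illustrative assumptions.
out = getMFNcVar('t2m_monthly_*.nc', ['2 metre temperature'])
t2m = out['2 metre temperature']
print(t2m['units'], t2m['standard_name'], t2m['values'].shape)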