def globsimScaled2Pandas(ncdf_in, station_nr):
    """
    Read a scaled (or interpolated) globsim netCDF file and return all values
    for one station as a Pandas data frame.

    ncdf_in: full path to a globsim netCDF (by station)

    station_nr: station_number, as given in the stations .csv file, to
                identify the station.
    """
    # open file
    ncf = nc.Dataset(ncdf_in, 'r')

    # station mask
    sm = ncf.variables['station'][:] == int(station_nr)

    # list variables; use str_encode() as elsewhere in this module
    # (bytes keys from .encode('UTF8') would fail under Python 3)
    varlist = [str_encode(x) for x in ncf.variables.keys()]

    # get and convert time
    time = ncf.variables['time'][:]
    t_unit = ncf.variables['time'].units
    t_cal = ncf.variables['time'].calendar
    time = nc.num2date(time, units=t_unit, calendar=t_cal)

    # make data frame with time
    df = pd.DataFrame(data=time, columns=['time'])

    # add variables
    for var in varlist:
        if variables_skip(var):
            continue
        data = ncf.variables[var][:, sm]
        df = pd.concat([df, pd.DataFrame(data=data, columns=[var])], axis=1)

    return df
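# Usage sketch (illustrative only): pull one station out of a scaled globsim
# file and save it as CSV. The file name and station number below are
# hypothetical placeholders, not part of globsim.
def _example_station_to_csv():
    df = globsimScaled2Pandas("scaled_era5.nc", station_nr=1)
    print(df.head())  # 'time' column plus one column per variable
    df.to_csv("station_1.csv", index=False)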
def netCDF_empty(self, ncfile_out, stations, nc_in):
    # TODO: change data type from f4 to f8 for lat and lon
    '''
    Creates an empty station file to hold interpolated results. The number of
    stations is defined by the variable stations, variables are determined by
    the variable list passed from the gridded original netCDF.

    ncfile_out: full name of the file to be created

    stations: station list read with common_utils.StationListRead()

    nc_in: handle of the gridded source netCDF whose variables (and pressure
           levels, if present) are copied
    '''
    rootgrp = netcdf_base(ncfile_out, len(stations), None,
                          'hours since 1980-01-01 00:00:00')

    station = rootgrp["station"]
    latitude = rootgrp["latitude"]
    longitude = rootgrp["longitude"]
    height = rootgrp["height"]

    # assign station characteristics
    station[:] = list(stations['station_number'])
    latitude[:] = list(stations['latitude_dd'])
    longitude[:] = list(stations['longitude_dd'])
    height[:] = list(stations['elevation_m'])

    # extra treatment for pressure level files
    try:
        lev = nc_in.variables['level'][:]
        logger.info("Creating empty 3D file (has pressure levels)")
        level = rootgrp.createDimension('level', len(lev))
        level = rootgrp.createVariable('level', 'i4', ('level',))
        level.long_name = 'pressure_level'
        level.units = 'hPa'
        level[:] = lev
    except Exception:
        logger.info("Creating empty 2D file (without pressure levels)")
        lev = []

    # list variables of the input file; extra variables are skipped below
    varlist_merra = [str_encode(x) for x in nc_in.variables.keys()]

    # create and assign variables based on input file
    for n, var in enumerate(varlist_merra):
        if variables_skip(var):
            continue
        logger.debug(f"Add empty variable: {var}")

        # extra treatment for pressure level files
        if len(lev):
            tmp = rootgrp.createVariable(var, 'f4',
                                         ('time', 'level', 'station'))
        else:
            tmp = rootgrp.createVariable(var, 'f4', ('time', 'station'))

        tmp.long_name = str_encode(nc_in.variables[var].long_name)  # for merra2
        tmp.units = str_encode(nc_in.variables[var].units)

    # close the file
    rootgrp.close()
    logger.debug(f"Created empty netcdf file {ncfile_out}")
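# Usage sketch (illustrative only): create the empty station file from a
# gridded MERRA-2 download. File names are hypothetical placeholders and
# 'interp' stands for the interpolator instance this method belongs to;
# assumes the module-level 'import netCDF4 as nc'.
def _example_netCDF_empty(interp):
    nc_in = nc.Dataset("merra2_sa_2017.nc", 'r')
    interp.netCDF_empty("merra2_stations.nc", interp.stations, nc_in)
    nc_in.close()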
def ERA2station(self, ncfile_in, ncfile_out, points,
                variables=None, date=None):
    """
    Bilinear interpolation from fields on a regular grid (latitude, longitude)
    to individual point stations (latitude, longitude). This works for
    surface and for pressure level files (all ERA-Interim files). The type of
    variable and file structure are determined from the input.

    This function creates an empty netCDF file to hold the interpolated
    results, by calling new_interpolated_netcdf(). Then, data is interpolated
    in temporal chunks and appended. The temporal chunking can be set in the
    interpolation parameter file.

    Args:
        ncfile_in: Full path to an ERA-Interim derived netCDF file. This can
                   contain wildcards to point to multiple files if temporal
                   chunking was used.

        ncfile_out: Full path to the output netCDF file to write.

        points: A dataframe of locations. See method StationListRead in
                common_utils.py for more details.

        variables: List of variable(s) to interpolate such as
                   ['r', 't', 'u', 'v', 't2m', 'u10', 'v10', 'ssrd', 'strd', 'tp'].
                   Defaults to using all variables available.

        date: Dictionary to specify begin and end time for the derived time
              series. Defaults to using all times available in ncfile_in.

    The chunk size self.cs sets how many time steps are interpolated at once;
    a small cs is slower but less memory-intensive.
    """
    # read in one type of multiple netCDF files
    ncf_in = nc.MFDataset(ncfile_in, 'r', aggdim='time')

    # is it a file with pressure levels?
    pl = 'level' in ncf_in.dimensions.keys()

    # build an empty output netCDF file
    rootgrp = new_interpolated_netcdf(
        ncfile_out, self.stations, ncf_in,
        time_units='hours since 1900-01-01 00:00:0.0')
    rootgrp.source = 'ERA_Interim, interpolated bilinearly to stations'
    rootgrp.close()

    # open the output netCDF file, set it to be appendable ('a')
    ncf_out = nc.Dataset(ncfile_out, 'a')

    # get time and convert to datetime object
    nctime = ncf_in.variables['time'][:]  # "hours since 1900-01-01 00:00:0.0"
    t_unit = ncf_in.variables['time'].units
    try:
        t_cal = ncf_in.variables['time'].calendar
    except AttributeError:  # attribute doesn't exist
        t_cal = u"gregorian"  # standard
    time = nc.num2date(nctime, units=t_unit, calendar=t_cal)

    # detect invariant files (topography etc.)
    invariant = len(time) == 1

    # restrict to date/time range if given
    if date is None:
        tmask = time < datetime(3000, 1, 1)
    else:
        tmask = (time < date['end']) * (time >= date['beg'])

    # get time vector for output
    time_in = nctime[tmask]

    # ensure that chunk sizes cover the entire period even if
    # len(time_in) is not an integer multiple of cs
    niter = len(time_in) // self.cs
    niter += ((len(time_in) % self.cs) > 0)

    # loop over chunks
    for n in range(niter):
        # indices (relative to index of the output file)
        beg = n * self.cs
        # restrict last chunk to length of tmask plus one (to get last time)
        end = min(n * self.cs + self.cs, len(time_in)) - 1

        # time to make tmask for chunk
        beg_time = nc.num2date(time_in[beg], units=t_unit, calendar=t_cal)
        if invariant:
            # allow topography to work in the same code, len(nctime) = 1
            end_time = nc.num2date(nctime[0], units=t_unit, calendar=t_cal)
            # end = 1
        else:
            end_time = nc.num2date(time_in[end], units=t_unit, calendar=t_cal)

        # NOTE: must be '<= end_time'; '< end_time' would drop the last time
        # step of the chunk and damage appending
        tmask_chunk = (time <= end_time) * (time >= beg_time)
        if invariant:
            # allow topography to work in the same code
            tmask_chunk = [True]

        # get the interpolated variables
        dfield, variables = self.interp2D(ncfile_in, ncf_in, self.stations,
                                          tmask_chunk,
                                          variables=None, date=None)

        # append time
        ncf_out.variables['time'][:] = np.append(
            ncf_out.variables['time'][:], time_in[beg:end + 1])

        # append variables
        for i, var in enumerate(variables):
            if variables_skip(var):
                continue

            if pl:
                if ESMFnew:
                    # dfield has dimensions (station, variables, time, pressure levels)
                    ncf_out.variables[var][beg:end + 1, :, :] = \
                        dfield.data[:, i, :, :].transpose((1, 2, 0))
                else:
                    # dfield has dimensions (time, level, station) for pressure level files
                    ncf_out.variables[var][beg:end + 1, :, :] = \
                        dfield.data[i, :, :, :]
            else:
                if ESMFnew:
                    # dfield has dimensions (station, variables, time)
                    ncf_out.variables[var][beg:end + 1, :] = \
                        dfield.data[:, i, :].transpose((1, 0))
                else:
                    # dfield has dimensions (time, station) for 2D files
                    ncf_out.variables[var][beg:end + 1, :] = \
                        dfield.data[i, :, :]

    # close the files
    ncf_in.close()
    ncf_out.close()
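# Sketch of the chunking arithmetic used above: 'niter' is a ceiling division,
# so the last (possibly shorter) chunk is never dropped. For example, 10 time
# steps with cs = 4 yield the inclusive index ranges [0..3], [4..7], [8..9].
def _example_chunk_indices(n_times=10, cs=4):
    niter = n_times // cs + ((n_times % cs) > 0)
    for n in range(niter):
        beg = n * cs
        end = min(beg + cs, n_times) - 1  # inclusive, mirrors ERA2station
        print(beg, end)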
def new_interpolated_netcdf(ncfile_out, stations, nc_in, time_units):
    """
    Creates an empty station file to hold interpolated results. The number of
    stations is defined by the variable stations, variables are determined by
    the variable list passed from the gridded original netCDF.

    ncfile_out: full name of the file to be created

    stations: station list read with common_utils.StationListRead()

    nc_in: handle of the gridded source netCDF whose variables (and pressure
           levels, if present) are copied

    time_units: time units string for the output file
    """
    logger.info(f"Creating new file {ncfile_out}")
    rootgrp = netcdf_base(ncfile_out, len(stations), None, time_units, nc_in)

    station = rootgrp['station']
    latitude = rootgrp['latitude']
    longitude = rootgrp['longitude']
    height = rootgrp['height']

    # assign station characteristics
    station[:] = list(stations['station_number'])
    latitude[:] = list(stations['latitude_dd'])
    longitude[:] = list(stations['longitude_dd'])
    height[:] = list(stations['elevation_m'])

    # extra treatment for pressure level files
    try:
        lev = nc_in.variables['level'][:]
        logger.info("Source dataset is 3D (has pressure levels)")
        level = rootgrp.createDimension('level', len(lev))
        level = rootgrp.createVariable('level', 'i4', ('level',))
        level.long_name = 'pressure_level'
        level.units = 'hPa'
        level[:] = lev
    except Exception:
        logger.info("Source dataset is 2D (without pressure levels)")
        lev = []

    # extra treatment for ensemble files
    try:
        num = rootgrp['number'][:]
    except Exception:
        num = []

    # create and assign variables based on input file
    for n, var in enumerate(nc_in.variables):
        if variables_skip(var):
            continue

        # choose dimensions depending on ensemble members and pressure levels
        if len(num):
            if len(lev):
                tmp = rootgrp.createVariable(
                    var, 'f4', ('time', 'number', 'level', 'station'))
            else:
                tmp = rootgrp.createVariable(
                    var, 'f4', ('time', 'number', 'station'))
        else:
            if len(lev):
                tmp = rootgrp.createVariable(
                    var, 'f4', ('time', 'level', 'station'))
            else:
                tmp = rootgrp.createVariable(var, 'f4', ('time', 'station'))

        # copy attributes, except _FillValue which must be set at creation time
        input_var = nc_in.variables[var]
        for key in input_var.ncattrs():
            if key in ['_FillValue']:
                continue
            tmp.setncattr(key, getattr(input_var, key))

        logger.info(f"Created new empty variable: {str_encode(var)} [{tmp.units}]")

    return rootgrp
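# Usage sketch (illustrative only): build the empty interpolated file for an
# ERA-Interim pressure-level download. File names mirror the calls above but
# are hypothetical placeholders; 'stations' is a station list read with
# common_utils.StationListRead().
def _example_new_interpolated_netcdf(stations):
    nc_in = nc.Dataset("era_pl_2017.nc", 'r')
    rootgrp = new_interpolated_netcdf(
        "era_pl_stations.nc", stations, nc_in,
        time_units='hours since 1900-01-01 00:00:0.0')
    rootgrp.close()
    nc_in.close()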
def makeNCF(self, dsi):
    variables = self.getVars(dsi)
    dataLev = self.getDataLev(dsi)
    self.getDimName(dataLev)

    # aggregate the files of the first variable to read the dimensions
    varf = np.sort(glob.glob(path.join(self.directory, f'*{variables[0]}*')))
    ncf = nc.MFDataset(varf.tolist(), aggdim='initial_time0_hours')

    if dataLev == 'pl':
        Levs = ncf['lv_ISBL1'][:].data
    Times = ncf[self.timeName][:]
    Lats = ncf[self.latName][:].data
    Lons = ncf[self.lonName][:].data

    file_new = self.getOutFile(ncf, dataLev)

    # initialize new data file and create group
    ncn = nc.Dataset(file_new, 'w', format='NETCDF4_CLASSIC')

    # make dimensions
    if dataLev == 'pl':
        Levs = ncf[self.levName][:].data
        ncn.createDimension('level', len(Levs))
        levels = ncn.createVariable('level', 'i4', ('level',))
        levels.long_name = 'pressure level'
        levels.units = 'mbar'
        levels[:] = Levs
    ncn.createDimension('time', len(Times))
    ncn.createDimension('latitude', len(Lats))
    ncn.createDimension('longitude', len(Lons))

    # make dimension variables
    times = ncn.createVariable('time', 'd', ('time',))
    latitudes = ncn.createVariable('latitude', 'f8', ('latitude',))
    longitudes = ncn.createVariable('longitude', 'f8', ('longitude',))

    times.standard_name = 'time'
    times.units = ncf[self.timeName].units
    times.calendar = 'standard'
    latitudes.standard_name = ncf[self.latName].long_name
    latitudes.units = ncf[self.latName].units
    longitudes.standard_name = ncf[self.lonName].long_name
    longitudes.units = ncf[self.lonName].units

    ncf.close()

    # assign dimensions
    times[:] = Times
    longitudes[:] = Lons
    latitudes[:] = Lats

    # copy each variable from its per-variable files, then remove the sources
    for vari in variables:
        flist = np.sort(glob.glob(path.join(self.directory, f'*{vari}*')))
        ncf = nc.MFDataset(flist.tolist(), aggdim=self.timeName)
        for n, var in enumerate(ncf.variables.keys()):
            if variables_skip(self.ncfVar[var]):
                continue
            logger.info(f"Creating variable: {var}")
            # 'ncvar' avoids shadowing the loop variable 'vari'
            if dataLev == 'pl':
                ncvar = ncn.createVariable(
                    self.ncfVar[var], 'f4',
                    ('time', 'level', 'latitude', 'longitude'))
                ncvar[:, :, :, :] = ncf[var][:, :, :, :]
            else:
                ncvar = ncn.createVariable(
                    self.ncfVar[var], 'f4',
                    ('time', 'latitude', 'longitude'))
                ncvar[:, :, :] = ncf[var][:, :, :]

            ncvar.long_name = ncf[var].long_name
            ncvar.units = ncf[var].units

        ncf.close()
        for f in flist:
            remove(f)

    ncn.close()
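# Minimal sketch of the nc.MFDataset aggregation that makeNCF() relies on:
# several files that differ only along the aggregation dimension are
# presented as one dataset. The file pattern is a hypothetical placeholder;
# assumes the module-level 'import glob' and 'import netCDF4 as nc'.
def _example_mfdataset(pattern="air.2m.*.nc"):
    ds = nc.MFDataset(sorted(glob.glob(pattern)), aggdim='time')
    print(len(ds.variables['time']))  # total number of steps across all files
    ds.close()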
def MERRA2station(self, ncfile_in, ncfile_out, points,
                  variables=None, date=None):
    """
    Interpolate the chosen variables from a MERRA-2 download directory to
    stations:

    1. Create the empty netCDF file to hold the interpolated results, by
       calling self.netCDF_empty
    2. Get the interpolated results from interp2D
    3. Append all variables to the empty netCDF file
    4. Close all files

    Args:
        ncfile_in: Full path to a MERRA-2 derived netCDF file. This can
                   contain wildcards to point to multiple files if temporal
                   chunking was used.

        ncfile_out: Full path to the output netCDF file to write.

        points: A dataframe of locations. See method StationListRead in
                common_utils.py for more details.

        variables: List of variable(s) to interpolate such as
                   ['T', 'RH', 'U', 'V', 'T2M', 'U2M', 'V2M', 'U10M', 'V10M',
                    'PRECTOT', 'SWGDN', 'SWGDNCLR', 'LWGDN', 'LWGDNCLR'].
                   Defaults to using all variables available.

        date: Dictionary to specify begin and end time for the derived time
              series. Defaults to using all times available in ncfile_in.
    """
    # read in one type of multiple netCDF files
    logger.info("Loading reanalysis data into memory")
    ncf_in = nc.MFDataset(ncfile_in, 'r', aggdim='time')

    # check station bounds
    self.validate_stations_extent(ncf_in)

    # is it a file with pressure levels?
    pl = 'level' in ncf_in.dimensions.keys()

    # build an empty output netCDF file
    self.netCDF_empty(ncfile_out, self.stations, ncf_in)

    # open the output netCDF file, set it to be appendable ('a')
    ncf_out = nc.Dataset(ncfile_out, 'a')

    # get time and convert to datetime object
    nctime = ncf_in.variables['time'][:]  # "hours since 1980-01-01 00:00:00"
    t_unit = "hours since 1980-01-01 00:00:00"  # ncf_in.variables['time'].units
    try:
        t_cal = ncf_in.variables['time'].calendar
    except AttributeError:  # attribute doesn't exist
        t_cal = u"gregorian"  # or standard
    # TODO: rm time = [nc.num2date(timei, units=t_unit, calendar=t_cal)
    #                  for timei in nctime]
    # TODO: rm time = np.asarray(time)

    # detect invariant files (topography etc.)
    invariant = len(np.unique(nctime)) <= 2

    # restrict to date/time range if given
    if date is None:
        tmask = nctime < nc.date2num(datetime(3000, 1, 1),
                                     units=t_unit, calendar=t_cal)
    else:
        beg_num = nc.date2num(date['beg'], units=t_unit, calendar=t_cal)
        end_num = nc.date2num(date['end'], units=t_unit, calendar=t_cal)
        tmask = (nctime < end_num) * (nctime >= beg_num)

    if not any(tmask):
        sys.exit('''\n ERROR: No downloaded data exist within the date range
                 specified by the interpolation control file.
                 Download new data or change 'beg' / 'end' in the
                 interpolation control file''')

    # get time indices
    time_in = nctime[tmask]

    # ensure that chunk sizes cover the entire period even if
    # len(time_in) is not an integer multiple of cs
    niter = len(time_in) // self.cs
    niter += ((len(time_in) % self.cs) > 0)

    # loop in chunks of size cs
    for n in range(niter):
        # indices
        beg = n * self.cs
        # restrict last chunk to length of tmask plus one (to get last time)
        if invariant:
            end = beg
        else:
            end = min(n * self.cs + self.cs, len(time_in)) - 1

        # make tmask for chunk
        beg_time = time_in[beg]
        if invariant:
            # allow topography to work in the same code, len(nctime) = 1
            # TODO: rm end_time = nc.num2date(nctime[0], units=t_unit, calendar=t_cal)
            end_time = nctime[0]
            # end = 1
        else:
            # TODO: rm end_time = nc.num2date(time_in[end], units=t_unit, calendar=t_cal)
            end_time = time_in[end]

        # NOTE: must be '<= end_time'; '< end_time' would drop the last time
        # step of the chunk and damage appending
        tmask_chunk = (nctime <= end_time) * (nctime >= beg_time)
        if invariant:
            # allow topography to work in the same code
            tmask_chunk = np.array([True])

        # get the interpolated variables
        dfield, variables = self.interp2D(ncfile_in, ncf_in, self.stations,
                                          tmask_chunk,
                                          variables=None, date=None)

        # append time
        ncf_out.variables['time'][:] = np.append(
            ncf_out.variables['time'][:], time_in[beg:end + 1])

        # append variables
        for i, var in enumerate(variables):
            if variables_skip(var):
                continue

            # extra treatment for pressure level files
            if pl:
                # lev = ncf_in.variables['level'][:]
                ncf_out.variables[var][beg:end + 1, :, :] = \
                    dfield.data[:, i, :, :].transpose(1, 2, 0)
            else:
                ncf_out.variables[var][beg:end + 1, :] = \
                    dfield.data[:, i, :].transpose(1, 0)

    ncf_in.close()
    ncf_out.close()
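# Usage sketch (illustrative only) of the 'date' argument expected above:
# a dict with 'beg' and 'end' datetime objects. File names and dates are
# hypothetical placeholders; 'interp' stands for the interpolator instance
# and the module-level 'from datetime import datetime' is assumed.
def _example_merra2station(interp):
    date = {'beg': datetime(2017, 1, 1), 'end': datetime(2017, 12, 31)}
    interp.MERRA2station("merra_sa_*.nc", "merra_sa_stations.nc",
                         interp.stations, variables=None, date=date)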