def loadCFSR_TS(name=dataset_name, grid=None, varlist=None, varatts=None, resolution='hires',
                filelist=None, folder=None, lautoregrid=None):
    ''' Get a properly formatted CFSR dataset with monthly mean time-series. '''
    if grid is None:
        # load from original time-series files
        if folder is None: folder = orig_ts_folder
        # translate varlist
        if varatts is None: varatts = tsvaratts.copy()
        if varlist is None:
            if resolution in ('hires', '03', '031'): varlist = varlist_hires
            elif resolution in ('lowres', '05'): varlist = varlist_lowres
        if varlist and varatts: varlist = translateVarNames(varlist, varatts)
        if filelist is None: # generate default filelist
            if resolution in ('hires', '03', '031'):
                files = [hiresfiles[var] for var in varlist if var in hiresfiles]
            elif resolution in ('lowres', '05'):
                files = [lowresfiles[var] for var in varlist if var in lowresfiles]
        else:
            files = filelist # use the user-provided file list
        # load dataset
        dataset = DatasetNetCDF(name=name, folder=folder, filelist=files, varlist=varlist,
                                varatts=varatts, check_override=['time'], multifile=False,
                                ncformat='NETCDF4_CLASSIC')
        # load static data
        if filelist is None: # generate default filelist
            if resolution in ('hires', '03', '031'):
                files = [hiresstatic[var] for var in varlist if var in hiresstatic]
            elif resolution in ('lowres', '05'):
                files = [lowresstatic[var] for var in varlist if var in lowresstatic]
            # load constants, if any (with singleton time axis)
            if len(files) > 0:
                staticdata = DatasetNetCDF(name=name, folder=folder, filelist=files, varlist=varlist,
                                           varatts=varatts, axes=dict(lon=dataset.lon, lat=dataset.lat),
                                           multifile=False, check_override=['time'],
                                           ncformat='NETCDF4_CLASSIC')
                # N.B.: need to override the axes, so that the datasets are consistent
                if len(staticdata.variables) > 0:
                    for var in staticdata.variables.values():
                        if not dataset.hasVariable(var.name):
                            var.squeeze() # remove time dimension
                            dataset.addVariable(var, copy=False) # no need to copy... but we can't write to the netcdf file!
        # replace time axis with number of months since Jan 1979
        data = np.arange(0, len(dataset.time), 1, dtype='int16') # months since 1979 (Jan 1979 = 0)
        timeAxis = Axis(name='time', units='month', coord=data, atts=dict(long_name='Month since 1979-01'))
        dataset.replaceAxis(dataset.time, timeAxis, asNC=False, deepcopy=False)
        # add projection
        dataset = addGDALtoDataset(dataset, projection=None, geotransform=None, gridfolder=grid_folder)
        # N.B.: projection should be auto-detected as geographic
    else:
        # load from neatly formatted and regridded time-series files
        if folder is None: folder = avgfolder
        grid, resolution = checkGridRes(grid, resolution)
        dataset = loadObservations(name=name, folder=folder, projection=None, resolution=resolution,
                                   grid=grid, period=None, varlist=varlist, varatts=varatts,
                                   filepattern=tsfile, filelist=filelist, lautoregrid=lautoregrid,
                                   mode='time-series')
    # return formatted dataset
    return dataset
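# Usage sketch (illustration only, not part of the original module): load the native
# high-resolution CFSR time-series and inspect it. This assumes the module-level
# defaults referenced above (orig_ts_folder, varlist_hires, etc.) are configured;
# the variable name 'precip' is an assumption for demonstration purposes.
if __name__ == '__main__':
    cfsr = loadCFSR_TS(resolution='hires')  # native grid, default variable list
    print(cfsr)  # print a summary of axes and variables
    if cfsr.hasVariable('precip'):  # 'precip' is an assumed variable name
        print(cfsr.variables['precip'].atts)  # show the translated metadata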
def computeClimatology(experiment, filetype, domain, periods=None, offset=0, griddef=None,
                       varlist=None, ldebug=False, loverwrite=False, lparallel=False,
                       pidstr='', logger=None):
    ''' Worker function to compute climatologies for given file parameters. '''
    # input type checks
    if not isinstance(experiment, Exp): raise TypeError
    if not isinstance(filetype, basestring): raise TypeError
    if not isinstance(domain, (np.integer, int)): raise TypeError
    if periods is not None and not (isinstance(periods, (tuple, list)) and isInt(periods)): raise TypeError
    if not isinstance(offset, (np.integer, int)): raise TypeError
    if not isinstance(loverwrite, (bool, np.bool)): raise TypeError
    if griddef is not None and not isinstance(griddef, GridDefinition): raise TypeError
    #if pidstr == '[proc01]': raise TypeError # to test error handling
    # load source
    dataset_name = experiment.name
    fileclass = fileclasses[filetype] # used for target file name
    tsfile = fileclass.tsfile.format(domain, '')
    expfolder = experiment.avgfolder
    filepath = '{:s}/{:s}'.format(expfolder, tsfile)
    logger.info('\n\n{0:s} *** Processing Experiment {1:<15s} *** '.format(pidstr, "'{:s}'".format(dataset_name)) +
                '\n{0:s} *** {1:^37s} *** \n'.format(pidstr, "'{:s}'".format(tsfile)))
    # check file and read begin/end dates
    if not os.path.exists(filepath):
        #raise IOError, "Source file '{:s}' does not exist!".format(filepath)
        # print message and skip
        skipmsg = "\n{:s} >>> File '{:s}' in dataset '{:s}' is missing --- skipping!".format(pidstr, tsfile, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.warning(skipmsg)
        # N.B.: this can cause a lot of error messages, when not all files are present
    else: # if monthly source file exists
        import netCDF4 as nc
        ncfile = nc.Dataset(filepath, mode='r')
        begintuple = ncfile.begin_date.split('-')
        endtuple = ncfile.end_date.split('-')
        ncfile.close()
        # N.B.: at this point we don't want to initialize a full GDAL-enabled dataset, since we don't
        #       even know if we need it, and it creates a lot of overhead
        # determine age of source file
        if not loverwrite:
            sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))
        # figure out start date
        filebegin = int(begintuple[0]) # first element is the year
        fileend = int(endtuple[0]) # first element is the year
        begindate = offset + filebegin
        if not (filebegin <= begindate <= fileend): raise DateError
        # handle cases where the first month in the record is not January
        firstmonth = int(begintuple[1]) # second element is the month
        shift = firstmonth - 1 # will be zero for January (01)
        ## loop over periods
        if periods is None: periods = [begindate - fileend]
        # periods.sort(reverse=True) # reverse, so that largest chunk is done first
        source = None # will later be assigned to the source dataset
        for period in periods:
            # figure out period
            enddate = begindate + period
            if filebegin > enddate: raise DateError, 'End date earlier than begin date.'
            if enddate - 1 > fileend: # if filebegin is 1979 and the simulation is 10 years, fileend will be 1988, not 1989!
                # if end date is not available, skip period
                endmsg = "\n{:s} --- Invalid Period for '{:s}': End Date {:4d} not in File! --- \n".format(pidstr, dataset_name, enddate)
                endmsg += "{:s} --- ('{:s}')\n".format(pidstr, filepath)
                logger.info(endmsg)
            else:
                ## perform averaging for selected period
                # determine if sink file already exists, and what to do about it
                periodstr = '{0:4d}-{1:4d}'.format(begindate, enddate)
                gridstr = '' if griddef is None or griddef.name == 'WRF' else '_' + griddef.name
                filename = fileclass.climfile.format(domain, gridstr, '_' + periodstr)
                if ldebug: filename = 'test_' + filename
                if lparallel: tmppfx = 'tmp_wrfavg_{:s}_'.format(pidstr[1:-1])
                else: tmppfx = 'tmp_wrfavg_'
                tmpfilename = tmppfx + filename
                assert os.path.exists(expfolder)
                filepath = expfolder + filename
                tmpfilepath = expfolder + tmpfilename
                lskip = False # else just go ahead
                if os.path.exists(filepath):
                    if not loverwrite:
                        age = datetime.fromtimestamp(os.path.getmtime(filepath))
                        # if sink file is newer than source file, skip (do not recompute)
                        if age > sourceage and os.path.getsize(filepath) > 1e6: lskip = True
                        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
                        #print sourceage, age
                    if not lskip: os.remove(filepath)
                # depending on last modification time of file or overwrite setting, start computation, or skip
                if lskip:
                    # print message
                    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
                    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(skipmsg)
                else:
                    ## begin actual computation
                    lregrid = griddef is not None
                    beginmsg = "\n{:s} <<< Computing '{:s}' (d{:02d}) Climatology from {:s}".format(pidstr, dataset_name, domain, periodstr)
                    if not lregrid: beginmsg += " >>> \n"
                    else: beginmsg += " ('{:s}' grid) >>> \n".format(griddef.name)
                    logger.info(beginmsg)
                    ## actually load datasets
                    if source is None:
                        source = loadWRF_TS(experiment=experiment, filetypes=[filetype], domains=domain) # comes out as a tuple...
                    if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')
                    # prepare sink
                    if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files
                    sink = DatasetNetCDF(name='WRF Climatology', folder=expfolder, filelist=[tmpfilename],
                                         atts=source.atts.copy(), mode='w')
                    sink.atts.period = periodstr
                    # if lregrid: addGDALtoDataset(sink, griddef=griddef)
                    # initialize processing
                    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=lregrid, feedback=ldebug) # no need for lat/lon
                    # start processing climatology
                    if shift != 0:
                        logger.info('{0:s} (shifting climatology by {1:d} month, to start with January) \n'.format(pidstr, shift))
                    CPU.Climatology(period=period, offset=offset, shift=shift, flush=False)
                    # N.B.: immediate flushing should not be necessary for climatologies, since they are much smaller!
                    # reproject and resample (regrid) dataset
                    if lregrid:
                        CPU.Regrid(griddef=griddef, flush=True)
                        logger.info('{:s} --- {:s} --- \n'.format(pidstr, griddef.name))
                        logger.debug('{:s} --- {:s} --- \n'.format(pidstr, str(griddef)))
                    # sync temporary storage with output dataset (sink)
                    CPU.sync(flush=True)
                    # add Geopotential Height Variance
                    if 'GHT_Var' in sink and 'Z_var' not in sink:
                        data_array = (sink['GHT_Var'].data_array - sink['Z'].data_array**2)**0.5
                        atts = dict(name='Z_var', units='m', long_name='Square Root of Geopotential Height Variance')
                        sink += Variable(axes=sink['Z'].axes, data=data_array, atts=atts)
                    # add (relative) Vorticity Variance
                    if 'Vorticity_Var' in sink and 'zeta_var' not in sink:
                        data_array = (sink['Vorticity_Var'].data_array - sink['zeta'].data_array**2)**0.5
                        atts = dict(name='zeta_var', units='1/s', long_name='Square Root of Relative Vorticity Variance')
                        sink += Variable(axes=sink['zeta'].axes, data=data_array, atts=atts)
                    # add names and length of months
                    sink.axisAnnotation('name_of_month', name_of_month, 'time',
                                        atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
                    if not sink.hasVariable('length_of_month'):
                        sink += Variable(name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
                                         atts=dict(name='length_of_month', units='days', long_name='Length of Month'))
                    # close... and write results to file
                    sink.sync()
                    sink.close()
                    writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
                    writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(writemsg)
                    # rename file to proper name
                    if os.path.exists(filepath): os.remove(filepath) # remove old file
                    os.rename(tmpfilepath, filepath) # this will overwrite the old file
                    # print dataset
                    if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
                    # clean up (not sure if this is necessary, but there seems to be a memory leak...)
                    del sink, CPU; gc.collect() # get rid of these guys immediately
        # clean up and return
        if source is not None: source.unload(); del source
        # N.B.: source is only loaded once for all periods
        # N.B.: garbage is collected in multi-processing wrapper as well
    return 0 # so far, there is no measure of success, hence, if there is no crash...
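# Usage sketch (illustration only): a serial debug run of the climatology worker.
# 'experiments' is assumed to be a project-level mapping of names to Exp instances,
# and the filetype key 'srfc' is likewise an assumption, not a verified name.
if __name__ == '__main__':
    logger = logging.getLogger('wrfavg')
    logger.addHandler(logging.StreamHandler())
    experiment = experiments['max-ctrl']  # hypothetical Exp lookup
    ec = computeClimatology(experiment, 'srfc', 1, periods=[5, 10],
                            ldebug=True, logger=logger)  # 5- and 10-year climatologies
    assert ec == 0  # the worker returns 0 if it did not crash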
def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None, lwrite=True,
                      lreturn=False, ldebug=False, lparallel=False, pidstr='', logger=None):
    ''' Worker function to perform regridding for a given dataset and target grid. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError # all dataset arguments are kwargs
    if not isinstance(griddef, GridDefinition): raise TypeError
    if lparallel:
        if not lwrite: raise IOError, 'Can only write to disk in parallel mode (i.e. lwrite = True).'
        if lreturn: raise IOError, 'Can not return datasets in parallel mode (i.e. lreturn = False).'
    # logging
    if logger is None: # make new logger
        logger = logging.getLogger() # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger) # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError, 'Expected logger ID/handle in logger KW; got {}'.format(str(logger))
    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder
    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=lwrite,
                             grid=griddef.name.lower(), period=None, filetype=None)
    # prepare target dataset
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError, "Dataset folder '{:s}' does not exist!".format(avgfolder)
    lskip = False # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_regrid_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e6: lskip = True
                if hasattr(griddef, 'filepath') and griddef.filepath is not None:
                    gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
                    if age < gridage: lskip = False
                # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
            if not lskip: os.remove(filepath) # recompute
    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct() # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
            raise DateError, "Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period)
        # print message
        if mode == 'climatology':
            opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
        elif mode == 'time-series':
            opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
        else:
            raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode)
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')
        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = periodstr; atts['name'] = dataset_name; atts['grid'] = griddef.name
        if mode == 'climatology':
            atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
        elif mode == 'time-series':
            atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)
        # make new dataset
        if lwrite: # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts) # only create dataset in memory
        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
        # perform regridding (if target grid is different from native grid!)
        if griddef.name != dataset:
            # reproject and resample (regrid) dataset
            CPU.Regrid(griddef=griddef, flush=True)
        # get results
        CPU.sync(flush=True)
        # add geolocators
        sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
        # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!
        # add length and names of month
        if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'):
            addLengthAndNamesOfMonth(sink, noleap=True if dataset.upper() in ('WRF', 'CESM') else False)
        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink # destroy all references
                if os.path.exists(filepath): os.remove(filepath) # remove old file
                os.rename(tmpfilepath, filepath) # this would also overwrite the old file...
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed
        # clean up and return
        source.unload(); del source, CPU
        if lreturn:
            return sink # return dataset for further use (NetCDF file still open!)
        else:
            return 0 # "exit code"
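# Usage sketch (illustration only): regrid a climatology in serial mode and get the
# open dataset back (lreturn=True, so no temporary file is used). 'my_griddef' stands
# in for a GridDefinition instance, and the dataargs keys shown are assumptions that
# would have to match what getMetaData() expects for the chosen dataset.
if __name__ == '__main__':
    dataargs = dict(period=(1979, 1994), resolution='hires')  # assumed argument names
    sink = performRegridding('CFSR', 'climatology', my_griddef, dataargs,
                             lwrite=True, lreturn=True, ldebug=True)
    print(sink)  # N.B.: the underlying NetCDF file is still open here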
# determine averaging interval
offset = source.time.getIndex(period[0] - 1979) / 12 # origin of monthly time-series is at January 1979
# initialize processing
CPU = CentralProcessingUnit(source, sink, tmp=True)
# start processing climatology
CPU.Climatology(period=period[1] - period[0], offset=offset, flush=False)
# shift longitude axis by 180 degrees left (i.e. 0 - 360 -> -180 - 180)
CPU.Shift(lon=-180, flush=False)
# sync temporary storage with output (sink variable; do not flush!)
CPU.sync(flush=False)
# make new masks
if sink.hasVariable('landmask'):
    sink.mask(sink.landmask, maskSelf=False, varlist=['snow', 'snowh', 'zs'], invert=True, merge=False)
# add names and length of months
sink.axisAnnotation('name_of_month', name_of_month, 'time',
                    atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
#print ' === month === '
# sink += VarNC(sink.dataset, name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
#               atts=dict(name='length_of_month', units='days', long_name='Length of Month'))
# close...
sink.sync()
sink.close()
# print dataset
print('')
print(sink)
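# Worked example (illustration only) of the time-axis arithmetic used above: the
# monthly time axis has its origin at January 1979 (Jan 1979 = 0), so January of
# year Y sits at month index (Y - 1979) * 12, and an offset in whole years is that
# month index divided by 12.
months_since_1979 = (1984 - 1979) * 12  # January 1984 -> month index 60
offset_in_years = months_since_1979 // 12  # integer division recovers 5 years
assert offset_in_years == 5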
## add remaining CRU data
for varname in ['Q2', 'pet', 'cldfrc', 'wetfrq', 'frzfrq']:
    cruprd.variables[varname].load()
    sink.addVariable(cruprd.variables[varname], asNC=True, copy=True, deepcopy=True)
    cruprd.variables[varname].unload()
    sink.variables[varname].atts['source'] = 'CRU'
## add station meta data
if lshp:
    for varname in shp_params:
        var = gpcc025.variables[varname].load()
        sink.addVariable(var, asNC=True, copy=True, deepcopy=True)
# add names and length of months
sink.axisAnnotation('name_of_month', name_of_month, 'time',
                    atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
if not sink.hasVariable('length_of_month'):
    sink += Variable(name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
                     atts=dict(name='length_of_month', units='days', long_name='Length of Month'))
# apply higher resolution mask
if griddef is not None:
    sink.mask(sink.landmask, maskSelf=False, varlist=None,
              skiplist=['prismmask', 'lon2d', 'lat2d'], invert=False, merge=True)
# finalize changes
sink.sync()
sink.close()
print(sink)
print('\n Writing to: \'{0:s}\'\n'.format(filename))
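# Minimal sketch (an assumption, for illustration): plausible definitions of the
# name_of_month and days_per_month constants referenced by the annotations above;
# their actual definitions live elsewhere in the package and may differ (e.g. a
# climatological February length of 28.2425 days instead of 28.25).
name_of_month = ['January', 'February', 'March', 'April', 'May', 'June',
                 'July', 'August', 'September', 'October', 'November', 'December']
days_per_month = [31, 28.25, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]  # Feb: leap-year average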