def computeClimatology(experiment, filetype, domain, periods=None, offset=0, griddef=None,
                       varlist=None, ldebug=False, loverwrite=False, lparallel=False,
                       pidstr='', logger=None):
    ''' Worker function to compute climatologies for given file parameters. '''
    # input type checks
    if not isinstance(experiment, Exp): raise TypeError
    if not isinstance(filetype, basestring): raise TypeError
    if not isinstance(domain, (np.integer, int)): raise TypeError
    if periods is not None and not (isinstance(periods, (tuple, list)) and isInt(periods)):
        raise TypeError
    if not isinstance(offset, (np.integer, int)): raise TypeError
    if not isinstance(loverwrite, (bool, np.bool)): raise TypeError
    if griddef is not None and not isinstance(griddef, GridDefinition): raise TypeError
    #if pidstr == '[proc01]': raise TypeError # to test error handling

    # load source
    dataset_name = experiment.name
    fileclass = fileclasses[filetype]  # used for target file name
    tsfile = fileclass.tsfile.format(domain, '')
    expfolder = experiment.avgfolder
    filepath = '{:s}/{:s}'.format(expfolder, tsfile)
    logger.info('\n\n{0:s} *** Processing Experiment {1:<15s} *** '.format(pidstr, "'{:s}'".format(dataset_name)) +
                '\n{0:s} *** {1:^37s} *** \n'.format(pidstr, "'{:s}'".format(tsfile)))

    # check file and read begin/end dates
    if not os.path.exists(filepath):
        #raise IOError("Source file '{:s}' does not exist!".format(filepath))
        # print message and skip
        skipmsg = "\n{:s} >>> File '{:s}' in dataset '{:s}' is missing --- skipping!".format(pidstr, tsfile, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.warning(skipmsg)
        # N.B.: this can cause a lot of error messages, when not all files are present
    else:  # if the monthly source file exists
        import netCDF4 as nc
        ncfile = nc.Dataset(filepath, mode='r')
        begintuple = ncfile.begin_date.split('-')
        endtuple = ncfile.end_date.split('-')
        ncfile.close()
        # N.B.: at this point we don't want to initialize a full GDAL-enabled dataset, since we
        #       don't even know if we need it, and it creates a lot of overhead

        # determine age of source file
        if not loverwrite:
            sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

        # figure out start date
        filebegin = int(begintuple[0])  # first element is the year
        fileend = int(endtuple[0])  # first element is the year
        begindate = offset + filebegin
        if not (filebegin <= begindate <= fileend): raise DateError
        # handle cases where the first month in the record is not January
        firstmonth = int(begintuple[1])  # second element is the month
        shift = firstmonth - 1  # will be zero for January (01)

        ## loop over periods
        if periods is None: periods = [begindate - fileend]
        #periods.sort(reverse=True)  # reverse, so that the largest chunk is done first
        source = None  # will later be assigned to the source dataset
        for period in periods:
            # figure out period
            enddate = begindate + period
            if filebegin > enddate:
                raise DateError('End date earlier than begin date.')
            if enddate - 1 > fileend:
                # N.B.: if filebegin is 1979 and the simulation is 10 years, fileend will be 1988, not 1989!
                # if the end date is not available, skip this period
                endmsg = "\n{:s} --- Invalid Period for '{:s}': End Date {:4d} not in File! --- \n".format(pidstr, dataset_name, enddate)
                endmsg += "{:s} --- ('{:s}')\n".format(pidstr, filepath)
                logger.info(endmsg)
            else:
                ## perform averaging for selected period
                # determine if sink file already exists, and what to do about it
                periodstr = '{0:4d}-{1:4d}'.format(begindate, enddate)
                gridstr = '' if griddef is None or griddef.name == 'WRF' else '_' + griddef.name
                filename = fileclass.climfile.format(domain, gridstr, '_' + periodstr)
                if ldebug: filename = 'test_' + filename
                if lparallel: tmppfx = 'tmp_wrfavg_{:s}_'.format(pidstr[1:-1])
                else: tmppfx = 'tmp_wrfavg_'
                tmpfilename = tmppfx + filename
                assert os.path.exists(expfolder)
                filepath = expfolder + filename
                tmpfilepath = expfolder + tmpfilename
                lskip = False  # else just go ahead
                if os.path.exists(filepath):
                    if not loverwrite:
                        age = datetime.fromtimestamp(os.path.getmtime(filepath))
                        # if sink file is newer than source file, skip (do not recompute)
                        if age > sourceage and os.path.getsize(filepath) > 1e6: lskip = True
                        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
                        #print sourceage, age
                    if not lskip: os.remove(filepath)

                # depending on last modification time of file or overwrite setting, start computation, or skip
                if lskip:
                    # print message
                    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
                    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(skipmsg)
                else:
                    ## begin actual computation
                    beginmsg = "\n{:s} <<< Computing '{:s}' (d{:02d}) Climatology from {:s}".format(pidstr, dataset_name, domain, periodstr)
                    if griddef is None: beginmsg += " >>> \n"
                    else: beginmsg += " ('{:s}' grid) >>> \n".format(griddef.name)
                    logger.info(beginmsg)

                    ## actually load datasets
                    if source is None:
                        source = loadWRF_TS(experiment=experiment, filetypes=[filetype], domains=domain)  # comes out as a tuple...
                    if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

                    # prepare sink
                    if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
                    sink = DatasetNetCDF(name='WRF Climatology', folder=expfolder, filelist=[tmpfilename],
                                         atts=source.atts.copy(), mode='w')
                    sink.atts.period = periodstr

                    # initialize processing
                    lregrid = griddef is not None
                    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=lregrid, feedback=ldebug)  # no need for lat/lon

                    # start processing climatology
                    if shift != 0:
                        logger.info('{0:s} (shifting climatology by {1:d} month, to start with January) \n'.format(pidstr, shift))
                    CPU.Climatology(period=period, offset=offset, shift=shift, flush=False)
                    # N.B.: immediate flushing should not be necessary for climatologies, since they are much smaller!

                    # reproject and resample (regrid) dataset
                    if lregrid:
                        CPU.Regrid(griddef=griddef, flush=True)
                        logger.info('{:s} --- {:s} --- \n'.format(pidstr, str(griddef.geotransform)))

                    # sync temporary storage with output dataset (sink)
                    CPU.sync(flush=True)

                    # add Geopotential Height Variance
                    if 'GHT_Var' in sink and 'Z_var' not in sink:
                        data_array = (sink['GHT_Var'].data_array - sink['Z'].data_array**2)**0.5
                        atts = dict(name='Z_var', units='m', long_name='Square Root of Geopotential Height Variance')
                        sink += Variable(axes=sink['Z'].axes, data=data_array, atts=atts)

                    # add (relative) Vorticity Variance
                    if 'Vorticity_Var' in sink and 'zeta_var' not in sink:
                        data_array = (sink['Vorticity_Var'].data_array - sink['zeta'].data_array**2)**0.5
                        atts = dict(name='zeta_var', units='1/s', long_name='Square Root of Relative Vorticity Variance')
                        sink += Variable(axes=sink['zeta'].axes, data=data_array, atts=atts)

                    # add names and lengths of months
                    sink.axisAnnotation('name_of_month', name_of_month, 'time',
                                        atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
                    if not sink.hasVariable('length_of_month'):
                        sink += Variable(name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
                                         atts=dict(name='length_of_month', units='days', long_name='Length of Month'))

                    # close... and write results to file
                    sink.sync()
                    sink.close()
                    writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
                    writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(writemsg)
                    # rename file to proper name
                    if os.path.exists(filepath): os.remove(filepath)  # remove old file
                    os.rename(tmpfilepath, filepath)  # this will overwrite the old file

                    # print dataset
                    if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
                    # clean up (not sure if this is necessary, but there seems to be a memory leak...)
                    del sink, CPU
                    gc.collect()  # get rid of these guys immediately

        # clean up and return
        if source is not None:
            source.unload(); del source
        # N.B.: source is only loaded once for all periods
    # N.B.: garbage is collected in the multi-processing wrapper as well
    return 0  # so far, there is no measure of success other than the absence of a crash...
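# Usage sketch (illustrative, not part of the worker): how computeClimatology might be
# invoked serially. The 'WRF_exps' mapping and its key are hypothetical stand-ins for
# however Exp instances are looked up in this project; filetype and domain depend on
# the WRF setup.
import logging

logger = logging.getLogger('wrfavg.example')
logger.addHandler(logging.StreamHandler())
experiment = WRF_exps['max-ens']  # hypothetical lookup of an Exp instance
# compute 5- and 10-year climatologies from the surface time-series of domain 1
ec = computeClimatology(experiment, 'srfc', 1, periods=[5, 10], offset=0,
                        ldebug=True, logger=logger)
assert ec == 0  # the only measure of success is the absence of a crash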
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to extract point data from a gridded dataset. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not callable(stnfct): raise TypeError  # function to load station dataset
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    lclim = False; lts = False
    if mode == 'climatology': lclim = True
    elif mode == 'time-series': lts = True
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))

    ## extract meta data from arguments
    module, dataargs, loadfct, filepath, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

    # load template dataset
    stndata = stnfct()  # load station dataset from function
    if not isinstance(stndata, Dataset): raise TypeError
    # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well

    # determine age of source file
    if not loverwrite:
        sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

    # get filename for target dataset and do some checks
    filename = getTargetFile(stndata.name, dataset, mode, module, dataargs, lwrite)
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_exstns_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > sourceage and os.path.getsize(filepath) > 1e5: lskip = True
                # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a previous crash
            if not lskip: os.remove(filepath)  # recompute

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if lclim:
            opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
        elif lts:
            opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
        atts['name'] = dataset_name; atts['station'] = stndata.name
        atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title, dataset_name, mode.title())
        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
        # extract data at station locations
        CPU.Extract(template=stndata, flush=True)
        # get results
        CPU.sync(flush=True)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source  #, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
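# Sketch: because DatasetNetCDF instances do not pickle well, the station data is
# passed to performExtraction as a zero-argument loader function (stnfct) rather
# than as a dataset. A partial application of a station loader does the job;
# 'loadEC_stations' and its keyword arguments are assumptions for illustration.
from functools import partial

stnfct = partial(loadEC_stations, prov='ON', varlist=['precip', 'T2'])  # hypothetical loader
ec = performExtraction('GPCC', 'climatology', stnfct, dict(period=(1979, 1994)),
                       lwrite=True, ldebug=True)
# returns 0 on success (None if the target file was skipped)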
#sink.mask(sink.landmask)
#print sink.dataset
addLandMask(sink)  # create landmask from precip mask
#sink.stations.mask(sink.landmask)  # mask all fields using the new landmask
# add length and names of month
addLengthAndNamesOfMonth(sink, noleap=False)
#newvar = sink.precip
#print
#print newvar.name, newvar.masked
#print newvar.fillValue
#print newvar.data_array.__class__
#print
# close...
sink.sync()
sink.close()
# print dataset
print('')
print(sink)
del sink
print('')
## print time coordinate
#dataset = loadGPCC(grid=grid, resolution=res, period=period)
#print dataset
#print
#print dataset.time
#print
#print dataset.time.data_array
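# A minimal sketch of what the addLandMask step above amounts to, assuming the land
# mask is derived from the fill mask of the precipitation field (ocean cells are
# missing in gauge-based products); plain numpy stand-in, not the actual API:
import numpy as np

precip = np.ma.masked_invalid(np.array([[1.2, np.nan], [0.3, 2.1]]))
landmask = np.ma.getmaskarray(precip)  # True where precip is missing (i.e. ocean)
t2 = np.ma.masked_array([[280., 281.], [279., 283.]], mask=landmask)  # mask another field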
def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to perform regridding for a given dataset and target grid. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not isinstance(griddef, GridDefinition): raise TypeError
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder

    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs,
                             lwrite=lwrite, grid=griddef.name.lower())

    # prepare target dataset
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_regrid_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e6: lskip = True
                if hasattr(griddef, 'filepath') and griddef.filepath is not None:
                    gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
                    if age < gridage: lskip = False
                # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if mode == 'climatology':
            opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
        elif mode == 'time-series':
            opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = periodstr
        atts['name'] = dataset_name
        atts['grid'] = griddef.name
        if mode == 'climatology':
            atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
        elif mode == 'time-series':
            atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)

        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)

        # perform regridding (if target grid is different from native grid!)
        if griddef.name != dataset:
            # reproject and resample (regrid) dataset
            CPU.Regrid(griddef=griddef, flush=True)

        # get results
        CPU.sync(flush=True)

        # add geolocators
        sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
        # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!

        # add length and names of month
        if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'):
            addLengthAndNamesOfMonth(sink, noleap=True if dataset.upper() in ('WRF', 'CESM') else False)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)  # this would also overwrite the old file...
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
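# Usage sketch: regridding one dataset to a target grid. 'getGridDef' is a
# hypothetical helper; any GridDefinition instance (e.g. one derived from a WRF
# domain) can serve as the target grid, and dataargs are passed through to the
# dataset's load function.
griddef = getGridDef('arb2_d02')  # hypothetical lookup of a GridDefinition
ec = performRegridding('GPCC', 'climatology', griddef,
                       dict(period=(1979, 1994), resolution='025'),
                       lwrite=True, ldebug=True)
# returns 0 on success (None if the target file was skipped)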
sink.atts.period = periodstr
# determine averaging interval
offset = source.time.getIndex(period[0] - 1979) / 12  # origin of monthly time-series is at January 1979
# initialize processing
#CPU = CentralProcessingUnit(source, sink, varlist=['precip', 'T2'], tmp=True)  # no need for lat/lon
CPU = CentralProcessingUnit(source, sink, varlist=None, tmp=True)  # no need for lat/lon
# start processing climatology
CPU.Climatology(period=period[1] - period[0], offset=offset, flush=False)
# sync temporary storage with output
CPU.sync(flush=True)
## make new masks
#sink.mask(sink.landmask, maskSelf=False, varlist=['snow','snowh','zs'], invert=True, merge=False)
# add names and length of months
sink.axisAnnotation('name_of_month', name_of_month, 'time',
                    atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
#print ' === month === '
#sink += VarNC(sink.dataset, name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
#              atts=dict(name='length_of_month', units='days', long_name='Length of Month'))
# close...
sink.sync()
sink.close()
# print dataset
print('')
print(sink)
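# Worked example of the averaging-interval arithmetic above, assuming a monthly
# time axis with its origin at January 1979: a period starting in 1984 begins
# 60 months into the series, i.e. at an offset of 5 (full) years.
period = (1984, 1994)
month_index = (period[0] - 1979) * 12  # 60 months into the monthly time-series
offset = month_index // 12  # = 5 years, as passed to CPU.Climatology
assert offset == 5 and period[1] - period[0] == 10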
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to extract point data from a gridded dataset. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not callable(stnfct): raise TypeError  # function to load station dataset
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    lclim = False; lts = False
    if mode == 'climatology': lclim = True
    elif mode == 'time-series': lts = True
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder

    # load template dataset
    stndata = stnfct()  # load station dataset from function
    if not isinstance(stndata, Dataset): raise TypeError
    # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well

    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs,
                             lwrite=lwrite, station=stndata.name)
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_exstns_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e5: lskip = True
                # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a previous crash

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if lclim:
            opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
        elif lts:
            opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
        atts['name'] = dataset_name
        atts['station'] = stndata.name
        atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title, dataset_name, mode.title())
        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
        # extract data at station locations
        CPU.Extract(template=stndata, flush=True)
        # get results
        CPU.sync(flush=True)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source  #, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
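# Sketch: in serial mode the extracted stations can also be returned as an
# in-memory Dataset instead of being written to disk (lparallel must be False);
# 'stnfct' is the same zero-argument station loader as sketched earlier.
stndata = performExtraction('NARR', 'time-series', stnfct, dict(),
                            lwrite=False, lreturn=True, lparallel=False)
print(stndata)  # inspect the extracted point data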
def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to perform regridding for a given dataset and target grid. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not isinstance(griddef, GridDefinition): raise TypeError
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=lwrite,
                             grid=griddef.name.lower(), period=None, filetype=None)

    # prepare target dataset
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_regrid_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e6: lskip = True
                if hasattr(griddef, 'filepath') and griddef.filepath is not None:
                    gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
                    if age < gridage: lskip = False
                # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
            if not lskip: os.remove(filepath)  # recompute

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if mode == 'climatology':
            opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
        elif mode == 'time-series':
            opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = periodstr; atts['name'] = dataset_name; atts['grid'] = griddef.name
        if mode == 'climatology':
            atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
        elif mode == 'time-series':
            atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)

        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)

        # perform regridding (if target grid is different from native grid!)
        if griddef.name != dataset:
            # reproject and resample (regrid) dataset
            CPU.Regrid(griddef=griddef, flush=True)

        # get results
        CPU.sync(flush=True)

        # add geolocators
        sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
        # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!

        # add length and names of month
        if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'):
            addLengthAndNamesOfMonth(sink, noleap=True if dataset.upper() in ('WRF', 'CESM') else False)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)  # this would also overwrite the old file...
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
def computeClimatology(experiment, filetype, domain, periods=None, offset=0, griddef=None,
                       varlist=None, ldebug=False, loverwrite=False, lparallel=False,
                       pidstr='', logger=None):
    ''' Worker function to compute climatologies for given file parameters. '''
    # input type checks
    if not isinstance(experiment, Exp): raise TypeError
    if not isinstance(filetype, basestring): raise TypeError
    if not isinstance(domain, (np.integer, int)): raise TypeError
    if periods is not None and not (isinstance(periods, (tuple, list)) and isInt(periods)):
        raise TypeError
    if not isinstance(offset, (np.integer, int)): raise TypeError
    if not isinstance(loverwrite, (bool, np.bool)): raise TypeError
    if griddef is not None and not isinstance(griddef, GridDefinition): raise TypeError
    #if pidstr == '[proc01]': raise TypeError # to test error handling

    # load source
    dataset_name = experiment.name
    fileclass = fileclasses[filetype]  # used for target file name
    tsfile = fileclass.tsfile.format(domain, '')
    expfolder = experiment.avgfolder
    filepath = '{:s}/{:s}'.format(expfolder, tsfile)
    logger.info('\n\n{0:s} *** Processing Experiment {1:<15s} *** '.format(pidstr, "'{:s}'".format(dataset_name)) +
                '\n{0:s} *** {1:^37s} *** \n'.format(pidstr, "'{:s}'".format(tsfile)))

    # check file and read begin/end dates
    if not os.path.exists(filepath):
        #raise IOError("Source file '{:s}' does not exist!".format(filepath))
        # print message and skip
        skipmsg = "\n{:s} >>> File '{:s}' in dataset '{:s}' is missing --- skipping!".format(pidstr, tsfile, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.warning(skipmsg)
        # N.B.: this can cause a lot of error messages, when not all files are present
    else:  # if the monthly source file exists
        import netCDF4 as nc
        ncfile = nc.Dataset(filepath, mode='r')
        begintuple = ncfile.begin_date.split('-')
        endtuple = ncfile.end_date.split('-')
        ncfile.close()
        # N.B.: at this point we don't want to initialize a full GDAL-enabled dataset, since we
        #       don't even know if we need it, and it creates a lot of overhead

        # determine age of source file
        if not loverwrite:
            sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

        # figure out start date
        filebegin = int(begintuple[0])  # first element is the year
        fileend = int(endtuple[0])  # first element is the year
        begindate = offset + filebegin
        if not (filebegin <= begindate <= fileend): raise DateError
        # handle cases where the first month in the record is not January
        firstmonth = int(begintuple[1])  # second element is the month
        shift = firstmonth - 1  # will be zero for January (01)

        ## loop over periods
        if periods is None: periods = [begindate - fileend]
        #periods.sort(reverse=True)  # reverse, so that the largest chunk is done first
        source = None  # will later be assigned to the source dataset
        for period in periods:
            # figure out period
            enddate = begindate + period
            if filebegin > enddate:
                raise DateError('End date earlier than begin date.')
            if enddate - 1 > fileend:
                # N.B.: if filebegin is 1979 and the simulation is 10 years, fileend will be 1988, not 1989!
                # if the end date is not available, skip this period
                endmsg = "\n{:s} --- Invalid Period for '{:s}': End Date {:4d} not in File! --- \n".format(pidstr, dataset_name, enddate)
                endmsg += "{:s} --- ('{:s}')\n".format(pidstr, filepath)
                logger.info(endmsg)
            else:
                ## perform averaging for selected period
                # determine if sink file already exists, and what to do about it
                periodstr = '{0:4d}-{1:4d}'.format(begindate, enddate)
                gridstr = '' if griddef is None or griddef.name == 'WRF' else '_' + griddef.name
                filename = fileclass.climfile.format(domain, gridstr, '_' + periodstr)
                if ldebug: filename = 'test_' + filename
                if lparallel: tmppfx = 'tmp_wrfavg_{:s}_'.format(pidstr[1:-1])
                else: tmppfx = 'tmp_wrfavg_'
                tmpfilename = tmppfx + filename
                assert os.path.exists(expfolder)
                filepath = expfolder + filename
                tmpfilepath = expfolder + tmpfilename
                lskip = False  # else just go ahead
                if os.path.exists(filepath):
                    if not loverwrite:
                        age = datetime.fromtimestamp(os.path.getmtime(filepath))
                        # if sink file is newer than source file, skip (do not recompute)
                        if age > sourceage and os.path.getsize(filepath) > 1e6: lskip = True
                        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
                        #print sourceage, age
                    if not lskip: os.remove(filepath)

                # depending on last modification time of file or overwrite setting, start computation, or skip
                if lskip:
                    # print message
                    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
                    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(skipmsg)
                else:
                    # determine whether we have to regrid
                    lregrid = griddef is not None
                    ## begin actual computation
                    beginmsg = "\n{:s} <<< Computing '{:s}' (d{:02d}) Climatology from {:s}".format(pidstr, dataset_name, domain, periodstr)
                    if not lregrid: beginmsg += " >>> \n"
                    else: beginmsg += " ('{:s}' grid) >>> \n".format(griddef.name)
                    logger.info(beginmsg)

                    ## actually load datasets
                    if source is None:
                        source = loadWRF_TS(experiment=experiment, filetypes=[filetype], domains=domain)  # comes out as a tuple...
                    if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

                    # prepare sink
                    if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
                    sink = DatasetNetCDF(name='WRF Climatology', folder=expfolder, filelist=[tmpfilename],
                                         atts=source.atts.copy(), mode='w')
                    sink.atts.period = periodstr
                    #if lregrid: addGDALtoDataset(sink, griddef=griddef)

                    # initialize processing
                    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=lregrid, feedback=ldebug)  # no need for lat/lon

                    # start processing climatology
                    if shift != 0:
                        logger.info('{0:s} (shifting climatology by {1:d} month, to start with January) \n'.format(pidstr, shift))
                    CPU.Climatology(period=period, offset=offset, shift=shift, flush=False)
                    # N.B.: immediate flushing should not be necessary for climatologies, since they are much smaller!

                    # reproject and resample (regrid) dataset
                    if lregrid:
                        CPU.Regrid(griddef=griddef, flush=True)
                        logger.info('{:s} --- {:s} --- \n'.format(pidstr, griddef.name))
                        logger.debug('{:s} --- {:s} --- \n'.format(pidstr, str(griddef)))

                    # sync temporary storage with output dataset (sink)
                    CPU.sync(flush=True)

                    # add Geopotential Height Variance
                    if 'GHT_Var' in sink and 'Z_var' not in sink:
                        data_array = (sink['GHT_Var'].data_array - sink['Z'].data_array**2)**0.5
                        atts = dict(name='Z_var', units='m', long_name='Square Root of Geopotential Height Variance')
                        sink += Variable(axes=sink['Z'].axes, data=data_array, atts=atts)

                    # add (relative) Vorticity Variance
                    if 'Vorticity_Var' in sink and 'zeta_var' not in sink:
                        data_array = (sink['Vorticity_Var'].data_array - sink['zeta'].data_array**2)**0.5
                        atts = dict(name='zeta_var', units='1/s', long_name='Square Root of Relative Vorticity Variance')
                        sink += Variable(axes=sink['zeta'].axes, data=data_array, atts=atts)

                    # add names and lengths of months
                    sink.axisAnnotation('name_of_month', name_of_month, 'time',
                                        atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
                    if not sink.hasVariable('length_of_month'):
                        sink += Variable(name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
                                         atts=dict(name='length_of_month', units='days', long_name='Length of Month'))

                    # close... and write results to file
                    sink.sync()
                    sink.close()
                    writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
                    writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(writemsg)
                    # rename file to proper name
                    if os.path.exists(filepath): os.remove(filepath)  # remove old file
                    os.rename(tmpfilepath, filepath)  # this will overwrite the old file

                    # print dataset
                    if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
                    # clean up (not sure if this is necessary, but there seems to be a memory leak...)
                    del sink, CPU
                    gc.collect()  # get rid of these guys immediately

        # clean up and return
        if source is not None:
            source.unload(); del source
        # N.B.: source is only loaded once for all periods
    # N.B.: garbage is collected in the multi-processing wrapper as well
    return 0  # so far, there is no measure of success other than the absence of a crash...
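# Dispatch sketch: the worker signature (pidstr, logger handles, integer exit codes)
# is designed for an asynchronous process pool. The project presumably has its own
# pool wrapper; plain multiprocessing illustrates the calling convention. A small
# top-level shim creates the per-process logger, since Logger instances do not
# pickle; 'experiments' is an assumed list of Exp instances.
import logging
import multiprocessing

def climatologyTask(experiment, filetype, domain, **kwargs):
    logger = logging.getLogger('wrfavg.worker')  # per-process logger
    if not logger.handlers: logger.addHandler(logging.StreamHandler())
    return computeClimatology(experiment, filetype, domain, logger=logger, **kwargs)

pool = multiprocessing.Pool(processes=4)
results = [pool.apply_async(climatologyTask, (exp, 'srfc', dom),
                            dict(periods=[5, 10], lparallel=True, pidstr='[proc]'))
           for exp in experiments for dom in (1, 2)]
pool.close(); pool.join()
exitcodes = [r.get() for r in results]  # 0 on success; worker exceptions re-raise here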