def loadCRU_TS(name=dataset_name, grid=None, varlist=None, resolution=None, varatts=None,
               filelist=None, folder=None, lautoregrid=None):
  ''' Get a properly formatted CRU dataset with monthly mean time-series. '''
  if grid is None: # load from original time-series files
    if folder is None: folder = orig_ts_folder
    # translate varlist
    if varatts is None: varatts = tsvaratts.copy()
    if varlist is None: varlist = varatts.keys()
    if varlist and varatts: varlist = translateVarNames(varlist, varatts)
    # assemble filelist
    if filelist is None: # generate default filelist
      filelist = [orig_ts_file.format(var) for var in varlist if var not in nofile]
    # load dataset
    dataset = DatasetNetCDF(name=name, folder=folder, filelist=filelist, varlist=varlist,
                            varatts=varatts, multifile=False, ncformat='NETCDF4_CLASSIC')
    # replace time axis with number of month since Jan 1979
    data = np.arange(0,len(dataset.time),1, dtype='int16') + (1901-1979)*12 # month since 1979 (Jan 1979 = 0)
    timeAxis = Axis(name='time', units='month', coord=data, atts=dict(long_name='Month since 1979-01'))
    dataset.replaceAxis(dataset.time, timeAxis, asNC=False, deepcopy=False)
    # add projection
    dataset = addGDALtoDataset(dataset, projection=None, geotransform=None, gridfolder=grid_folder)
    # N.B.: projection should be auto-detected as geographic
  else: # load from neatly formatted and regridded time-series files
    if folder is None: folder = avgfolder
    dataset = loadObservations(name=name, folder=folder, projection=None, resolution=None, grid=grid,
                               period=None, varlist=varlist, varatts=varatts, filepattern=tsfile,
                               filelist=filelist, lautoregrid=lautoregrid, mode='time-series')
  # return formatted dataset
  return dataset
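# A minimal usage sketch (not part of the original module), assuming the module-level defaults
# (orig_ts_folder, avgfolder, tsvaratts, tsfile) point to a valid local CRU data root and that
# 'precip' and 'T2' are defined in tsvaratts; the helper itself is hypothetical.
def _exampleLoadCRU():
  ''' illustrate the two load paths of loadCRU_TS '''
  # grid=None reads the original CRU time-series files on the native lat/lon grid
  cru = loadCRU_TS(varlist=['precip','T2'])
  assert cru.time.coord[0] == (1901-1979)*12 # time axis starts at -936, i.e. Jan 1901
  # a named grid defers to loadObservations and requires pre-regridded files (or lautoregrid=True)
  cru_narr = loadCRU_TS(grid='NARR', lautoregrid=True)
  return cru, cru_narr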
def loadGPCC_TS(name=dataset_name, grid=None, varlist=None, resolution='25', varatts=None,
                filelist=None, folder=None, lautoregrid=None):
  ''' Get a properly formatted dataset with the monthly GPCC time-series. '''
  if grid is None: # load from original time-series files
    if folder is None: folder = orig_ts_folder
    # prepare input
    if resolution not in ('05', '10', '25'):
      raise DatasetError, "Selected resolution '%s' is not available!"%resolution
    # translate varlist
    if varatts is None: varatts = tsvaratts.copy()
    if varlist is None: varlist = varatts.keys()
    if varlist and varatts: varlist = translateVarNames(varlist, varatts)
    if filelist is None: # generate default filelist
      filelist = []
      if 'p' in varlist: filelist.append(orig_ts_file.format('precip',resolution))
      if 's' in varlist: filelist.append(orig_ts_file.format('statio',resolution))
    # load dataset
    dataset = DatasetNetCDF(name=name, folder=folder, filelist=filelist, varlist=varlist,
                            varatts=varatts, multifile=False, ncformat='NETCDF4_CLASSIC')
    # replace time axis with number of month since Jan 1979
    data = np.arange(0,len(dataset.time),1, dtype='int16') + (1901-1979)*12 # month since 1979 (Jan 1979 = 0)
    timeAxis = Axis(name='time', units='month', coord=data, atts=dict(long_name='Month since 1979-01'))
    dataset.replaceAxis(dataset.time, timeAxis, asNC=False, deepcopy=False)
    # add GDAL info
    dataset = addGDALtoDataset(dataset, projection=None, geotransform=None)
    # N.B.: projection should be auto-detected as geographic
  else: # load from neatly formatted and regridded time-series files
    if folder is None: folder = avgfolder
    grid, resolution = checkGridRes(grid, resolution, period=None, lclim=False)
    dataset = loadObservations(name=name, folder=folder, projection=None, resolution=resolution,
                               grid=grid, period=None, varlist=varlist, varatts=varatts,
                               filepattern=tsfile, filelist=filelist, lautoregrid=lautoregrid,
                               mode='time-series')
  # return formatted dataset
  return dataset
def loadNARR_TS(name=dataset_name, grid=None, varlist=None, resolution=None, varatts=None,
                filelist=None, folder=None, lautoregrid=None):
  ''' Get a properly formatted NARR dataset with monthly mean time-series. '''
  if grid is None: # load from original time-series files
    if folder is None: folder = orig_ts_folder
    # translate varlist
    if varatts is None: varatts = tsvaratts.copy()
    if varlist is None: varlist = tsvarlist
    if varlist and varatts: varlist = translateVarNames(varlist, varatts)
    if filelist is None: # generate default filelist
      filelist = [orig_ts_file.format(special[var]) if var in special else orig_ts_file.format(var)
                  for var in varlist if var not in nofile and var in varatts]
    # load dataset
    dataset = DatasetNetCDF(name=name, folder=folder, filelist=filelist, varlist=varlist, varatts=varatts,
                            atts=projdict, multifile=False, ncformat='NETCDF4_CLASSIC')
    # replace time axis with number of month since Jan 1979
    data = np.arange(0,len(dataset.time),1, dtype='int16') # month since 1979 (Jan 1979 = 0)
    timeAxis = Axis(name='time', units='month', coord=data, atts=dict(long_name='Month since 1979-01'))
    dataset.replaceAxis(dataset.time, timeAxis, asNC=False, deepcopy=False)
    # add projection
    projection = getProjFromDict(projdict, name='{0:s} Coordinate System'.format(name))
    dataset = addGDALtoDataset(dataset, projection=projection, geotransform=None, gridfolder=grid_folder)
  else: # load from neatly formatted and regridded time-series files
    if folder is None: folder = avgfolder
    dataset = loadObservations(name=name, folder=folder, projection=None, resolution=None, grid=grid,
                               period=None, varlist=varlist, varatts=varatts, filepattern=tsfile,
                               filelist=filelist, lautoregrid=lautoregrid, mode='time-series')
  # return formatted dataset
  return dataset
def loadNARR_LTM(name=dataset_name, varlist=None, grid=None, interval='monthly', varatts=None,
                 filelist=None, folder=ltmfolder):
  ''' Get a properly formatted dataset of daily or monthly NARR climatologies (LTM). '''
  if grid is None: # load from original time-series files
    if folder is None: folder = orig_ts_folder
    # prepare input
    if varatts is None: varatts = ltmvaratts.copy()
    if varlist is None: varlist = ltmvarlist
    if interval == 'monthly':
      pfx = '.mon.ltm.nc'; tlen = 12
    elif interval == 'daily':
      pfx = '.day.ltm.nc'; tlen = 365
    else: raise DatasetError, "Selected interval '%s' is not supported!"%interval
    # translate varlist
    if varlist and varatts: varlist = translateVarNames(varlist, varatts)
    # axes dictionary, primarily to override time axis
    axes = dict(time=Axis(name='time',units='day',coord=(1,tlen,tlen)),load=True)
    if filelist is None: # generate default filelist
      filelist = [special[var]+pfx if var in special else var+pfx for var in varlist if var not in nofile]
    # load dataset
    dataset = DatasetNetCDF(name=name, folder=folder, filelist=filelist, varlist=varlist, varatts=varatts,
                            axes=axes, atts=projdict, multifile=False, ncformat='NETCDF4_CLASSIC')
    # add projection
    projection = getProjFromDict(projdict, name='{0:s} Coordinate System'.format(name))
    dataset = addGDALtoDataset(dataset, projection=projection, geotransform=None, folder=grid_folder)
  else: # load from neatly formatted and regridded time-series files
    if folder is None: folder = avgfolder
    raise NotImplementedError, "Need to implement loading neatly formatted and regridded time-series!"
  # return formatted dataset
  return dataset
def loadPCIC_LTM(name=dataset_name, varlist=None, varatts=ltmvaratts, filelist=None, folder=ltmfolder):
  ''' Get a properly formatted dataset of the monthly PCIC PRISM climatology. '''
  # translate varlist
  if varlist is None: varlist = varatts.keys()
  if varlist and varatts: varlist = translateVarNames(varlist, varatts)
  # generate file list
  filelist = [ltmfile.format(var) for var in varlist if var not in ('time', 'lat', 'lon')]
  # load variables separately
  dataset = DatasetNetCDF(name=name, folder=folder, filelist=filelist, varlist=varlist,
                          varatts=varatts, ncformat='NETCDF4')
  dataset = addGDALtoDataset(dataset, projection=None, geotransform=None, gridfolder=grid_folder)
  # N.B.: projection should be auto-detected as geographic
  # return formatted dataset
  return dataset
def loadGPCC_LTM(name=dataset_name, varlist=None, resolution='025', varatts=ltmvaratts,
                 filelist=None, folder=ltmfolder):
  ''' Get a properly formatted dataset of the monthly accumulated GPCC precipitation climatology. '''
  # prepare input
  if resolution not in ('025', '05', '10', '25'):
    raise DatasetError, "Selected resolution '%s' is not available!"%resolution
  # translate varlist
  if varlist is None: varlist = varatts.keys()
  if varlist and varatts: varlist = translateVarNames(varlist, varatts)
  # load variables separately
  if 'p' in varlist:
    dataset = DatasetNetCDF(name=name, folder=folder, filelist=['normals_v2011_%s.nc'%resolution],
                            varlist=['p'], varatts=varatts, ncformat='NETCDF4_CLASSIC')
  if 's' in varlist:
    gauges = nc.Dataset(folder+'normals_gauges_v2011_%s.nc'%resolution, mode='r', format='NETCDF4_CLASSIC')
    stations = Variable(data=gauges.variables['p'][0,:,:], axes=(dataset.lat,dataset.lon), **varatts['s'])
    # consolidate dataset
    dataset.addVariable(stations, asNC=False, copy=True)
  dataset = addGDALtoDataset(dataset, projection=None, geotransform=None, gridfolder=grid_folder)
  # N.B.: projection should be auto-detected as geographic
  # return formatted dataset
  return dataset
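# A short usage sketch (illustration only, not from the original module): load the 0.25-degree
# GPCC normals with precipitation and gauge counts; 'p' and 's' are the internal variable keys
# used above, so the public names depend on the ltmvaratts mapping.
#   ltm = loadGPCC_LTM(varlist=['p','s'], resolution='025')
#   print(ltm) # precipitation normals plus the station-count field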
def loadCFSR_TS(name=dataset_name, grid=None, varlist=None, varatts=None, resolution='hires',
                filelist=None, folder=None, lautoregrid=None):
  ''' Get a properly formatted CFSR dataset with monthly mean time-series. '''
  if grid is None: # load from original time-series files
    if folder is None: folder = orig_ts_folder
    # translate varlist
    if varatts is None: varatts = tsvaratts.copy()
    if varlist is None:
      if resolution == 'hires' or resolution == '03' or resolution == '031': varlist = varlist_hires
      elif resolution == 'lowres' or resolution == '05': varlist = varlist_lowres
    if varlist and varatts: varlist = translateVarNames(varlist, varatts)
    if filelist is None: # generate default filelist
      if resolution == 'hires' or resolution == '03' or resolution == '031':
        files = [hiresfiles[var] for var in varlist if var in hiresfiles]
      elif resolution == 'lowres' or resolution == '05':
        files = [lowresfiles[var] for var in varlist if var in lowresfiles]
    # load dataset
    dataset = DatasetNetCDF(name=name, folder=folder, filelist=files, varlist=varlist,
                            varatts=varatts, check_override=['time'], multifile=False,
                            ncformat='NETCDF4_CLASSIC')
    # load static data
    if filelist is None: # generate default filelist
      if resolution == 'hires' or resolution == '03' or resolution == '031':
        files = [hiresstatic[var] for var in varlist if var in hiresstatic]
      elif resolution == 'lowres' or resolution == '05':
        files = [lowresstatic[var] for var in varlist if var in lowresstatic]
      # load constants, if any (and with singleton time axis)
      if len(files) > 0:
        staticdata = DatasetNetCDF(name=name, folder=folder, filelist=files, varlist=varlist,
                                   varatts=varatts, axes=dict(lon=dataset.lon, lat=dataset.lat),
                                   multifile=False, check_override=['time'], ncformat='NETCDF4_CLASSIC')
        # N.B.: need to override the axes, so that the datasets are consistent
        if len(staticdata.variables) > 0:
          for var in staticdata.variables.values():
            if not dataset.hasVariable(var.name):
              var.squeeze() # remove time dimension
              dataset.addVariable(var, copy=False) # no need to copy... but we can't write to the netcdf file!
    # replace time axis with number of month since Jan 1979
    data = np.arange(0,len(dataset.time),1, dtype='int16') # month since 1979 (Jan 1979 = 0)
    timeAxis = Axis(name='time', units='month', coord=data, atts=dict(long_name='Month since 1979-01'))
    dataset.replaceAxis(dataset.time, timeAxis, asNC=False, deepcopy=False)
    # add projection
    dataset = addGDALtoDataset(dataset, projection=None, geotransform=None, gridfolder=grid_folder)
    # N.B.: projection should be auto-detected as geographic
  else: # load from neatly formatted and regridded time-series files
    if folder is None: folder = avgfolder
    grid, resolution = checkGridRes(grid, resolution)
    dataset = loadObservations(name=name, folder=folder, projection=None, resolution=resolution,
                               grid=grid, period=None, varlist=varlist, varatts=varatts,
                               filepattern=tsfile, filelist=filelist, lautoregrid=lautoregrid,
                               mode='time-series')
  # return formatted dataset
  return dataset
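# The repeated alias checks above ('hires'/'03'/'031' vs. 'lowres'/'05') could be collected in one
# place; the following helper is a hypothetical sketch, not part of the original module:
def _normalizeCFSRResolution(resolution):
  ''' map CFSR resolution aliases onto 'hires'/'lowres' (illustrative only) '''
  if resolution in ('hires', '03', '031'): return 'hires' # ~0.31 degree grid
  elif resolution in ('lowres', '05'): return 'lowres' # 0.5 degree grid
  else: raise DatasetError, "Selected resolution '%s' is not available!"%resolution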
elif mode == 'average_timeseries':

  # load source
  periodstr = '{0:4d}-{1:4d}'.format(*period)
  print('\n')
  print(' *** Processing Resolution %s from %s *** '%(res,periodstr))
  print('\n')
  source = loadCFSR_TS(resolution=res)
  print(source)
  print('\n')
  # prepare sink
  filename = avgfile.format('_'+res,'_'+periodstr)
  if os.path.exists(avgfolder+filename): os.remove(avgfolder+filename)
  sink = DatasetNetCDF(name='CFSR Climatology', folder=avgfolder, filelist=[filename], atts=source.atts, mode='w')
  sink.atts.period = periodstr
  # determine averaging interval
  offset = source.time.getIndex(period[0]-1979)/12 # origin of monthly time-series is at January 1979
  # initialize processing
  CPU = CentralProcessingUnit(source, sink, tmp=True)
  # start processing climatology
  CPU.Climatology(period=period[1]-period[0], offset=offset, flush=False)
  # shift longitude axis by 180 degrees left (i.e. 0 - 360 -> -180 - 180)
  CPU.Shift(lon=-180, flush=False)
  # sync temporary storage with output (sink variable; do not flush!)
  CPU.sync(flush=False)
def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None, lwrite=True,
                      lreturn=False, ldebug=False, lparallel=False, pidstr='', logger=None):
  ''' worker function to perform regridding for a given dataset and target grid '''
  # input checking
  if not isinstance(dataset,basestring): raise TypeError
  if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs
  if not isinstance(griddef,GridDefinition): raise TypeError
  if lparallel:
    if not lwrite: raise IOError, 'Can only write to disk in parallel mode (i.e. lwrite = True).'
    if lreturn: raise IOError, 'Can not return datasets in parallel mode (i.e. lreturn = False).'
  # logging
  if logger is None: # make new logger
    logger = logging.getLogger() # new logger
    logger.addHandler(logging.StreamHandler())
  else:
    if isinstance(logger,basestring):
      logger = logging.getLogger(name=logger) # connect to existing one
    elif not isinstance(logger,logging.Logger):
      raise TypeError, 'Expected logger ID/handle in logger KW; got {}'.format(str(logger))

  ## extract meta data from arguments
  dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
  dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

  # get filename for target dataset and do some checks
  filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=lwrite,
                           grid=griddef.name.lower(), period=None, filetype=None)

  # prepare target dataset
  if ldebug: filename = 'test_' + filename
  if not os.path.exists(avgfolder): raise IOError, "Dataset folder '{:s}' does not exist!".format(avgfolder)
  lskip = False # else just go ahead
  if lwrite:
    if lreturn:
      tmpfilename = filename # no temporary file if dataset is passed on (can't rename the file while it is open!)
    else:
      if lparallel: tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
      else: tmppfx = 'tmp_regrid_'
      tmpfilename = tmppfx + filename
    filepath = avgfolder + filename
    tmpfilepath = avgfolder + tmpfilename
    if os.path.exists(filepath):
      if not loverwrite:
        age = datetime.fromtimestamp(os.path.getmtime(filepath))
        # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
        if age > srcage and os.path.getsize(filepath) > 1e6: lskip = True
        # also recompute, if the grid definition file is newer than the sink file
        if hasattr(griddef, 'filepath') and griddef.filepath is not None:
          gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
          if age < gridage: lskip = False
        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
      if not lskip: os.remove(filepath) # recompute

  # depending on last modification time of file or overwrite setting, start computation, or skip
  if lskip:
    # print message
    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr,filename,dataset_name)
    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,filepath)
    logger.info(skipmsg)
  else:
    ## actually load datasets
    source = loadfct() # load source
    # check period
    if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
      raise DateError, "Specified period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period)

    # print message
    if mode == 'climatology': opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
    elif mode == 'time-series': opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
    else: raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode)
    # print feedback to logger
    logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr,datamsgstr,opmsgstr))
    if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')

    ## create new sink/target file
    # set attributes
    atts = source.atts.copy()
    atts['period'] = periodstr; atts['name'] = dataset_name; atts['grid'] = griddef.name
    if mode == 'climatology': atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
    elif mode == 'time-series': atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)

    # make new dataset
    if lwrite: # write to NetCDF file
      if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files
      sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
    else: sink = Dataset(atts=atts) # only create dataset in memory

    # initialize processing
    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)

    # perform regridding (if target grid is different from native grid!)
    if griddef.name != dataset:
      # reproject and resample (regrid) dataset
      CPU.Regrid(griddef=griddef, flush=True)

    # get results
    CPU.sync(flush=True)

    # add geolocators
    sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
    # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!

    # add length and names of month
    if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'):
      addLengthAndNamesOfMonth(sink, noleap=True if dataset.upper() in ('WRF','CESM') else False)

    # print dataset
    if not lparallel and ldebug:
      logger.info('\n'+str(sink)+'\n')

    # write results to file
    if lwrite:
      sink.sync()
      writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr,filename,dataset_name)
      writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,filepath)
      logger.info(writemsg)
      # rename file to proper name
      if not lreturn:
        sink.unload(); sink.close(); del sink # destroy all references
        if os.path.exists(filepath): os.remove(filepath) # remove old file
        os.rename(tmpfilepath,filepath) # this would also overwrite the old file...
      # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

    # clean up and return
    source.unload(); del source, CPU
    if lreturn:
      return sink # return dataset for further use (netcdf file still open!)
    else:
      return 0 # "exit code"
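# A hypothetical invocation sketch (not part of the original module): regrid the CRU climatology
# onto a pickled target grid; the dataargs keys follow the conventions of getMetaData as used above.
#   griddef = loadPickledGridDef(grid='NARR', res=None, folder=grid_folder)
#   dataargs = dict(period=(1979,2009), resolution=None)
#   ec = performRegridding('CRU', 'climatology', griddef, dataargs, loverwrite=True, ldebug=True)
#   assert ec == 0 # zero "exit code" indicates success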
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None, lwrite=True,
                      lreturn=False, ldebug=False, lparallel=False, pidstr='', logger=None):
  ''' worker function to extract point data from gridded dataset '''
  # input checking
  if not isinstance(dataset,basestring): raise TypeError
  if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs
  if not callable(stnfct): raise TypeError # function to load station dataset
  if lparallel:
    if not lwrite: raise IOError, 'In parallel mode we can only write to disk (i.e. lwrite = True).'
    if lreturn: raise IOError, 'Can not return datasets in parallel mode (i.e. lreturn = False).'
  # logging
  if logger is None: # make new logger
    logger = logging.getLogger() # new logger
    logger.addHandler(logging.StreamHandler())
  else:
    if isinstance(logger,basestring):
      logger = logging.getLogger(name=logger) # connect to existing one
    elif not isinstance(logger,logging.Logger):
      raise TypeError, 'Expected logger ID/handle in logger KW; got {}'.format(str(logger))
  lclim = False; lts = False
  if mode == 'climatology': lclim = True
  elif mode == 'time-series': lts = True
  else: raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode)

  ## extract meta data from arguments
  dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
  dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

  # load template dataset
  stndata = stnfct() # load station dataset from function
  if not isinstance(stndata, Dataset): raise TypeError
  # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well

  # get filename for target dataset and do some checks
  filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=lwrite, station=stndata.name)
  if ldebug: filename = 'test_' + filename
  if not os.path.exists(avgfolder): raise IOError, "Dataset folder '{:s}' does not exist!".format(avgfolder)
  lskip = False # else just go ahead
  if lwrite:
    if lreturn:
      tmpfilename = filename # no temporary file if dataset is passed on (can't rename the file while it is open!)
    else:
      if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
      else: tmppfx = 'tmp_exstns_'
      tmpfilename = tmppfx + filename
    filepath = avgfolder + filename
    tmpfilepath = avgfolder + tmpfilename
    if os.path.exists(filepath):
      if not loverwrite:
        age = datetime.fromtimestamp(os.path.getmtime(filepath))
        # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
        if age > srcage and os.path.getsize(filepath) > 1e5: lskip = True
        # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a previous crash

  # depending on last modification time of file or overwrite setting, start computation, or skip
  if lskip:
    # print message
    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr,filename,dataset_name)
    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,filepath)
    logger.info(skipmsg)
  else:
    ## actually load datasets
    source = loadfct() # load source
    # check period
    if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
      raise DateError, "Specified period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period)

    # print message
    if lclim: opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
    elif lts: opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
    else: raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode)
    # print feedback to logger
    logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr,datamsgstr,opmsgstr))
    if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')

    ## create new sink/target file
    # set attributes
    atts = source.atts.copy()
    atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
    atts['name'] = dataset_name; atts['station'] = stndata.name
    atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title,dataset_name,mode.title())

    # make new dataset
    if lwrite: # write to NetCDF file
      if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files
      sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
    else: sink = Dataset(atts=atts) # only create dataset in memory

    # initialize processing
    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)

    # extract data at station locations
    CPU.Extract(template=stndata, flush=True)
    # get results
    CPU.sync(flush=True)

    # print dataset
    if not lparallel and ldebug:
      logger.info('\n'+str(sink)+'\n')

    # write results to file
    if lwrite:
      sink.sync()
      writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr,filename,dataset_name)
      writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,filepath)
      logger.info(writemsg)
      # rename file to proper name
      if not lreturn:
        sink.unload(); sink.close(); del sink # destroy all references
        if os.path.exists(filepath): os.remove(filepath) # remove old file
        os.rename(tmpfilepath,filepath)
      # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

    # clean up and return
    source.unload(); del source #, CPU
    if lreturn:
      return sink # return dataset for further use (netcdf file still open!)
    else:
      return 0 # "exit code"
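# A hypothetical invocation sketch (illustration only): extract point data from the CRU time-series;
# 'loadEC_StnTS' stands in for whatever station loader is available - the only requirement is a
# picklable, argument-free callable that returns a station Dataset (see the pickling note above).
#   stnfct = lambda: loadEC_StnTS(station='ecprecip')
#   ec = performExtraction('CRU', 'time-series', stnfct, dict(resolution=None))
#   assert ec == 0 # zero "exit code" indicates success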
grid_name = grid
periodstr = '{0:4d}-{1:4d}'.format(*period)
print('\n *** Merging Shape-Averaged Time-Series on {:s} Grid *** \n'.format(grid,))

## prepare target dataset
filename = getFileName(grid=grid_name, period=None, name=None, filepattern=tsfile)
filepath = avgfolder + filename
print(' Saving data to: \'{0:s}\'\n'.format(filepath))
assert os.path.exists(avgfolder)
if os.path.exists(filepath): os.remove(filepath) # remove old file
# set attributes
atts = dict() # collect attributes, but add prefixes
atts = uclim.atts.copy()
atts['title'] = 'Corrected Time-series on {:s} Grid'.format(grid_name)
# make new dataset
sink = DatasetNetCDF(folder=avgfolder, filelist=[filename], atts=atts, mode='w')
# sync and write data so far
sink.sync()

## correct data (create variables)
for varname,var in uclim.variables.iteritems():
  print ''
  print varname
  # correct time-series variables
  if var.hasAxis('time'):
    if varname in CRU_vars:
      tsvar = cruts[varname]; climvar = cruclim[varname]
      assert tsvar.axisIndex('time') == 1, tsvar
      assert climvar.axisIndex('time') == 1 and var.axisIndex('time') == 1, climvar
      assert len(tsvar.axes[1])%12 == 0, len(tsvar.axes[1])
      assert tsvar.axes[1].coord[0]%12 == 0, tsvar.axes[1].coord[0]
  def __init__(self):
    self.name = 'const'
    self.atts = dict(orog = dict(name='zs', units='m')) # surface altitude

# axes (don't have their own file)
class Axes(FileType):
  ''' A mock-filetype for axes. '''
  def __init__(self):
    self.atts = dict(time = dict(name='time', units='days', offset=-47116, atts=dict(long_name='Month since 1979')), # time coordinate (days since 1979-01-01)
                     # NOTE THAT THE CMIP5 DATASETS HAVE DIFFERENT TIME OFFSETS BETWEEN MEMBERS !!!
                     # N.B.: the time coordinate is only used for the monthly time-series data, not the LTM;
                     #       the time offset is chosen such that 1979 begins with the origin (time=0)
                     lon = dict(name='lon', units='deg E'), # west-east coordinate
                     lat = dict(name='lat', units='deg N'), # south-north coordinate
                     plev = dict(name='lev', units='')) # hybrid pressure coordinate
    self.vars = self.atts.keys()

# Time-Series (monthly)
def loadCMIP5_TS(experiment=None, name=None, grid=None, filetypes=None, varlist=None, varatts=None,
                 translateVars=None, lautoregrid=None, load3D=False, ignore_list=None, lcheckExp=True,
                 lreplaceTime=True, lwrite=False, exps=None):
  ''' Get a properly formatted CESM dataset with a monthly time-series. (wrapper for loadCMIP5_All) '''
  return loadCMIP5_All(experiment=experiment, name=name, grid=grid, period=None, station=None,
                       filetypes=filetypes, varlist=varlist, varatts=varatts, translateVars=translateVars,
                       lautoregrid=lautoregrid, load3D=load3D, ignore_list=ignore_list,
                       mode='time-series', lcheckExp=lcheckExp, lreplaceTime=lreplaceTime,
                       lwrite=lwrite, exps=exps)

# load minimally pre-processed CESM climatology files
def loadCMIP5(experiment=None, name=None, grid=None, period=None, filetypes=None, varlist=None,
              varatts=None, translateVars=None, lautoregrid=None, load3D=False, ignore_list=None,
              lcheckExp=True, lreplaceTime=True, lencl=False, lwrite=False, exps=None):
  ''' Get a properly formatted monthly CESM climatology as NetCDFDataset. '''
  return loadCMIP5_All(experiment=experiment, name=name, grid=grid, period=period, station=None,
                       filetypes=filetypes, varlist=varlist, varatts=varatts, translateVars=translateVars,
                       lautoregrid=lautoregrid, load3D=load3D, ignore_list=ignore_list, exps=exps,
                       mode='climatology', lcheckExp=lcheckExp, lreplaceTime=lreplaceTime, lwrite=lwrite)

# load any of the various pre-processed CESM climatology and time-series files
def loadCMIP5_All(experiment=None, name=None, grid=None, station=None, shape=None, period=None,
                  varlist=None, varatts=None, translateVars=None, lautoregrid=None, load3D=False,
                  ignore_list=None, mode='climatology', cvdp_mode=None, lcheckExp=True, exps=None,
                  lreplaceTime=True, filetypes=None, lencl=False, lwrite=False, check_vars=None):
  ''' Get any of the monthly CESM files as a properly formatted NetCDFDataset. '''
  # period
  if isinstance(period,(tuple,list)):
    if not all(isNumber(period)): raise ValueError
  elif isinstance(period,basestring): period = [int(prd) for prd in period.split('-')]
  elif isinstance(period,(int,np.integer)) or period is None: pass # handled later
  else: raise DateError, "Illegal period definition: {:s}".format(str(period))
  # prepare input
  lclim = False; lts = False; lcvdp = False; ldiag = False # mode switches
  if mode.lower() == 'climatology': # post-processed climatology files
    lclim = True
    folder,experiment,name = getFolderName(name=name, experiment=experiment, folder=None, mode='avg', exps=exps)
    if period is None: raise DateError, 'Currently CESM Climatologies have to be loaded with the period explicitly specified.'
  elif mode.lower() in ('time-series','timeseries'): # concatenated time-series files
    lts = True
    folder,experiment,name = getFolderName(name=name, experiment=experiment, folder=None, mode='avg', exps=exps)
    lclim = False; period = None; periodstr = None # to indicate time-series (but for safety, the input must be more explicit)
    if lautoregrid is None: lautoregrid = False # this can take very long!
  elif mode.lower() == 'cvdp': # concatenated time-series files
    lcvdp = True
    folder,experiment,name = getFolderName(name=name, experiment=experiment, folder=None, mode='cvdp',
                                           cvdp_mode=cvdp_mode, exps=exps)
    if period is None:
      if not isinstance(experiment,Exp): raise DatasetError, 'Periods can only be inferred for registered datasets.'
      period = (experiment.beginyear, experiment.endyear)
  elif mode.lower() == 'diag': # concatenated time-series files
    ldiag = True
    folder,experiment,name = getFolderName(name=name, experiment=experiment, folder=None, mode='diag', exps=exps)
    raise NotImplementedError, "Loading AMWG diagnostic files is not supported yet."
  else: raise NotImplementedError,"Unsupported mode: '{:s}'".format(mode)
  # cast/copy varlist
  if isinstance(varlist,basestring): varlist = [varlist] # cast as list
  elif varlist is not None: varlist = list(varlist) # make copy to avoid interference
  # handle stations and shapes
  if station and shape: raise ArgumentError
  elif station or shape:
    if grid is not None: raise NotImplementedError, 'Currently CESM station data can only be loaded from the native grid.'
    if lcvdp: raise NotImplementedError, 'CVDP data is not available as station data.'
    if lautoregrid: raise GDALError, 'Station data can not be regridded, since it is not map data.'
    lstation = bool(station); lshape = bool(shape)
    # add station/shape parameters
    if varlist:
      params = stn_params if lstation else shp_params
      for param in params:
        if param not in varlist: varlist.append(param)
  else:
    lstation = False; lshape = False
  # period
  if isinstance(period,(int,np.integer)):
    if not isinstance(experiment,Exp): raise DatasetError, 'Integer periods are only supported for registered datasets.'
    period = (experiment.beginyear, experiment.beginyear+period)
  if lclim: periodstr = '_{0:4d}-{1:4d}'.format(*period)
  elif lcvdp: periodstr = '{0:4d}-{1:4d}'.format(period[0],period[1]-1)
  else: periodstr = ''
  # N.B.: the period convention in CVDP is that the end year is included
  # generate filelist and attributes based on filetypes and domain
  if filetypes is None: filetypes = ['atm','lnd']
  elif isinstance(filetypes,(list,tuple,set,basestring)):
    if isinstance(filetypes,basestring): filetypes = [filetypes]
    else: filetypes = list(filetypes)
    # interpret/replace WRF filetypes (for convenience)
    tmp = []
    for ft in filetypes:
      if ft in ('const','drydyn3d','moist3d','rad','plev3d','srfc','xtrm','hydro'):
        if 'atm' not in tmp: tmp.append('atm')
      elif ft in ('lsm','snow'):
        if 'lnd' not in tmp: tmp.append('lnd')
      elif ft in ('aux',): pass # currently not supported
#       elif ft in (,):
#         if 'atm' not in tmp: tmp.append('atm')
#         if 'lnd' not in tmp: tmp.append('lnd')
      else: tmp.append(ft)
    filetypes = tmp; del tmp
    if 'axes' not in filetypes: filetypes.append('axes')
  else: raise TypeError
  atts = dict(); filelist = []; typelist = []
  for filetype in filetypes:
    fileclass = fileclasses[filetype]
    if lclim and fileclass.climfile is not None: filelist.append(fileclass.climfile)
    elif lts and fileclass.tsfile is not None: filelist.append(fileclass.tsfile)
    elif lcvdp and fileclass.cvdpfile is not None: filelist.append(fileclass.cvdpfile)
    elif ldiag and fileclass.diagfile is not None: filelist.append(fileclass.diagfile)
    typelist.append(filetype)
    atts.update(fileclass.atts)
  # figure out ignore list
  if ignore_list is None: ignore_list = set(ignore_list_2D)
  elif isinstance(ignore_list,(list,tuple)): ignore_list = set(ignore_list)
  elif not isinstance(ignore_list,set): raise TypeError
  if not load3D: ignore_list.update(ignore_list_3D)
  if lautoregrid is None: lautoregrid = not load3D # don't auto-regrid 3D variables - takes too long!
  # translate varlist
  if varatts is not None: atts.update(varatts)
  lSST = False
  if varlist is not None:
    varlist = list(varlist)
    if 'SST' in varlist: # special handling of the SST variable, as it is part of Ts
      varlist.remove('SST')
      if not 'Ts' in varlist: varlist.append('Ts')
      lSST = True # Ts is renamed to SST below
    if translateVars is None: varlist = list(varlist) + translateVarNames(varlist, atts) # also add translations, just in case
    elif translateVars is True: varlist = translateVarNames(varlist, atts)
    # N.B.: DatasetNetCDF never applies translation!
  # NetCDF file mode
  ncmode = 'rw' if lwrite else 'r'
  # get grid or station-set name
  if lstation:
    # the station name can be inserted as the grid name
    gridstr = '_'+station.lower() # only use lower case for filenames
    griddef = None
  elif lshape:
    # the shape collection name can be inserted as the grid name
    gridstr = '_'+shape.lower() # only use lower case for filenames
    griddef = None
  else:
    if grid is None or grid == experiment.grid:
      gridstr = ''; griddef = None
    else:
      gridstr = '_'+grid.lower() # only use lower case for filenames
      griddef = loadPickledGridDef(grid=grid, res=None, filename=None, folder=grid_folder, check=True)
  # insert grid name and period
  filenames = []
  for filetype,fileformat in zip(typelist,filelist):
    if lclim: filename = fileformat.format(gridstr,periodstr) # put together specific filename for climatology
    elif lts: filename = fileformat.format(gridstr) # or for time-series
    elif lcvdp: filename = fileformat.format(experiment.name if experiment else name,periodstr) # not implemented: gridstr
    elif ldiag: raise NotImplementedError
    else: raise DatasetError
    filenames.append(filename) # append to list (passed to DatasetNetCDF later)
    # check existence
    filepath = '{:s}/{:s}'.format(folder,filename)
    if not os.path.exists(filepath):
      nativename = fileformat.format('',periodstr) # original filename (before regridding)
      nativepath = '{:s}/{:s}'.format(folder,nativename)
      if os.path.exists(nativepath):
        if lautoregrid:
          from processing.regrid import performRegridding # causes circular reference if imported earlier
          griddef = loadPickledGridDef(grid=grid, res=None, folder=grid_folder)
          dataargs = dict(experiment=experiment, filetypes=[filetype], period=period)
          print("The '{:s}' (CESM) dataset for the grid ('{:s}') is not available:\n Attempting regridding on-the-fly.".format(name,grid))
          if performRegridding('CESM','climatology' if lclim else 'time-series', griddef, dataargs): # default kwargs
            raise IOError, "Automatic regridding failed!"
print("Output: '{:s}'".format(name,filename,grid,filepath)) else: raise IOError, "The '{:s}' (CESM) dataset '{:s}' for the selected grid ('{:s}') is not available - use the regrid module to generate it.".format(name,filename,grid) else: raise IOError, "The '{:s}' (CESM) dataset file '{:s}' does not exits!\n({:s})".format(name,filename,folder) # load dataset #print varlist, filenames if experiment: title = experiment.title else: title = name dataset = DatasetNetCDF(name=name, folder=folder, filelist=filenames, varlist=varlist, axes=None, varatts=atts, title=title, multifile=False, ignore_list=ignore_list, ncformat='NETCDF4', squeeze=True, mode=ncmode, check_vars=check_vars) # replace time axis if lreplaceTime: if lts or lcvdp: # check time axis and center at 1979-01 (zero-based) if experiment is None: ys = period[0]; ms = 1 else: ys,ms,ds = [int(t) for t in experiment.begindate.split('-')]; assert ds == 1 if dataset.hasAxis('time'): ts = (ys-1979)*12 + (ms-1); te = ts+len(dataset.time) # month since 1979 (Jan 1979 = 0) atts = dict(long_name='Month since 1979-01') timeAxis = Axis(name='time', units='month', coord=np.arange(ts,te,1, dtype='int16'), atts=atts) dataset.replaceAxis(dataset.time, timeAxis, asNC=False, deepcopy=False) if dataset.hasAxis('year'): ts = ys-1979; te = ts+len(dataset.year) # month since 1979 (Jan 1979 = 0) atts = dict(long_name='Years since 1979-01') yearAxis = Axis(name='year', units='year', coord=np.arange(ts,te,1, dtype='int16'), atts=atts) dataset.replaceAxis(dataset.year, yearAxis, asNC=False, deepcopy=False) elif lclim: if dataset.hasAxis('time') and not dataset.time.units.lower() in monthlyUnitsList: atts = dict(long_name='Month of the Year') timeAxis = Axis(name='time', units='month', coord=np.arange(1,13, dtype='int16'), atts=atts) assert len(dataset.time) == len(timeAxis), dataset.time dataset.replaceAxis(dataset.time, timeAxis, asNC=False, deepcopy=False) elif dataset.hasAxis('year'): raise NotImplementedError, dataset # rename SST if lSST: dataset['SST'] = dataset.Ts # correct ordinal number of shape (should start at 1, not 0) if lshape: # mask all shapes that are incomplete in dataset if lencl and 'shp_encl' in dataset: dataset.mask(mask='shp_encl', invert=True) if dataset.hasAxis('shapes'): raise AxisError, "Axis 'shapes' should be renamed to 'shape'!" if not dataset.hasAxis('shape'): raise AxisError if dataset.shape.coord[0] == 0: dataset.shape.coord += 1 # check if len(dataset) == 0: raise DatasetError, 'Dataset is empty - check source file or variable list!' # add projection, if applicable if not ( lstation or lshape ): dataset = addGDALtoDataset(dataset, griddef=griddef, gridfolder=grid_folder, lwrap360=True, geolocator=True) # return formatted dataset return dataset ## Dataset API dataset_name = 'CMIP5' # dataset name root_folder # root folder of the dataset avgfolder # root folder for monthly averages outfolder # root folder for direct WRF output ts_file_pattern = 'cmip5{0:s}{1:s}_monthly.nc' # filename pattern: filetype, grid clim_file_pattern = 'cmip5{0:s}{1:s}_clim{2:s}.nc' # filename pattern: filetype, grid, period data_folder = root_folder # folder for user data grid_def = {'':None} # there are too many... grid_res = {'':1.} # approximate grid resolution at 45 degrees latitude default_grid = None # functions to access specific datasets loadLongTermMean = None # WRF doesn't have that... 
loadClimatology = loadCMIP5 # pre-processed, standardized climatology
loadTimeSeries = loadCMIP5_TS # time-series data
#loadStationClimatology = loadCMIP5_Stn # pre-processed, standardized climatology at stations
#loadStationTimeSeries = loadCMIP5_StnTS # time-series data at stations
#loadShapeClimatology = loadCMIP5_Shp # climatologies without associated grid (e.g. provinces or basins)
#loadShapeTimeSeries = loadCMIP5_ShpTS # time-series without associated grid (e.g. provinces or basins)


## (ab)use main execution for quick test
if __name__ == '__main__':

  # set mode/parameters
#   mode = 'test_climatology'
#   mode = 'test_timeseries'
#   mode = 'test_ensemble'
#   mode = 'test_point_climatology'
#   mode = 'test_point_timeseries'
#   mode = 'test_point_ensemble'
#   mode = 'test_cvdp'
  mode = 'pickle_grid'
#   mode = 'shift_lon'
#   experiments = ['Ctrl-1', 'Ctrl-A', 'Ctrl-B', 'Ctrl-C']
#   experiments += ['Ctrl-2050', 'Ctrl-A-2050', 'Ctrl-B-2050', 'Ctrl-C-2050']
  experiments = ('Ctrl-1',)
  periods = (15,)
  filetypes = ('atm',) # ['atm','lnd','ice']
  grids = ('cesm1x1',)*len(experiments) # grb1_d01
#   pntset = 'shpavg'
  pntset = 'ecprecip'

  from projects.CESM_experiments import Exp, CESM_exps, ensembles
  # N.B.: importing Exp through CESM_experiments is necessary, otherwise some isinstance() calls fail

  # pickle grid definition
  if mode == 'pickle_grid':

    for grid,experiment in zip(grids,experiments):

      print('')
      print('   ***   Pickling Grid Definition for {0:s}   ***   '.format(grid))
      print('')

      # load GridDefinition
      dataset = loadCESM(experiment=CESM_exps[experiment], grid=None, filetypes=['lnd'], period=(1979,1989))
      griddef = dataset.griddef
      #del griddef.xlon, griddef.ylat
      print griddef
      griddef.name = grid
      print('   Loading Definition from \'{0:s}\''.format(dataset.name))
      # save pickle
      filename = '{0:s}/{1:s}'.format(grid_folder,griddef_pickle.format(grid))
      if os.path.exists(filename): os.remove(filename) # overwrite
      filehandle = open(filename, 'w')
      pickle.dump(griddef, filehandle)
      filehandle.close()
      print('   Saving Pickle to \'{0:s}\''.format(filename))
      print('')

      # load pickle to make sure it is right
      del griddef
      griddef = loadPickledGridDef(grid, res=None, folder=grid_folder)
      print(griddef)
      print('')
      print griddef.wrap360
# generate averaged climatology
elif mode == 'average_timeseries':

  # load source
  periodstr = '%4i-%4i'%period
  print('\n')
  print(' *** Processing Grid %s from %s *** '%(grid,periodstr))
  print('\n')
  source = loadNARR_TS()
  print(source)
  print('\n')
  # prepare sink
  gridstr = '' if grid == 'NARR' else '_'+grid
  filename = avgfile.format(gridstr,'_'+periodstr)
  if os.path.exists(avgfolder+filename): os.remove(avgfolder+filename)
  sink = DatasetNetCDF(name='NARR Climatology', folder=avgfolder, filelist=[filename], atts=source.atts, mode='w')
  sink.atts.period = periodstr
  # determine averaging interval
  offset = source.time.getIndex(period[0]-1979)/12 # origin of monthly time-series is at January 1979
  # initialize processing
#   CPU = CentralProcessingUnit(source, sink, varlist=['precip', 'T2'], tmp=True) # no need for lat/lon
  CPU = CentralProcessingUnit(source, sink, varlist=None, tmp=True) # no need for lat/lon
  # start processing climatology
  CPU.Climatology(period=period[1]-period[0], offset=offset, flush=False)
  # sync temporary storage with output
  CPU.sync(flush=True)

#   # make new masks
    # load source
    periodstr = '%4i-%4i'%period
    print('\n')
    print(' *** Processing Time-series from %s *** '%(periodstr,))
    print('\n')
    source = loadCRU_TS()
    source = source(time=timeSlice(period)) # only get relevant time-slice
    print(source)
    assert period[0] != 1979 or source.time.coord[0] == 0
    assert len(source.time) == (period[1]-period[0])*12
    print('\n')
    # prepare sink
    filename = avgfile.format('','_'+periodstr,)
    if os.path.exists(avgfolder+filename): os.remove(avgfolder+filename)
    sink = DatasetNetCDF(name='CRU Climatology', folder=avgfolder, filelist=[filename], atts=source.atts, mode='w')
    sink.atts.period = periodstr
    # determine averaging interval
    offset = source.time.getIndex(period[0]-1979)/12 # origin of monthly time-series is at January 1979
    # initialize processing
#     CPU = CentralProcessingUnit(source, sink, varlist=['wetfrq'])
    CPU = CentralProcessingUnit(source, sink)
    # start processing
    print('')
    print(' +++ processing +++ ')
    CPU.Climatology(period=period[1]-period[0], offset=offset, flush=False)
    # sync temporary storage with output
    CPU.sync(flush=False)
    print('\n')
    # load source
    periodstr = 'Climatology' if period is None else '{0:4d}-{1:4d}'.format(*period)
    print('\n\n *** Processing Resolution %s from %s *** \n\n'%(res,periodstr))
    if period is None:
      source = loadGPCC_LTM(varlist=None,resolution=res) # ['stations','precip']
    else:
      source = loadGPCC_TS(varlist=None,resolution=res)
      source = source(time=timeSlice(period)) # only get relevant time-slice
    #source.load()
    print(source)
    print('\n')
    # prepare sink
    gridstr = res if grid == 'GPCC' else grid
    filename = getFileName(grid=gridstr, period=period, name='GPCC', filepattern=avgfile)
    if os.path.exists(avgfolder+filename): os.remove(avgfolder+filename)
    atts = dict(period=periodstr, name='GPCC', title='GPCC Climatology')
    sink = DatasetNetCDF(name='GPCC Climatology', folder=avgfolder, filelist=[filename], atts=atts, mode='w')
    # N.B.: pass the constructed attributes (period/name/title) to the sink
#     sink = addGDALtoDataset(sink, griddef=source.griddef)
    # initialize processing
    CPU = CentralProcessingUnit(source, sink, tmp=True)
    if period is not None:
      # determine averaging interval
      offset = source.time.getIndex(period[0]-1979)/12 # origin of monthly time-series is at January 1979
      # start processing climatology
      CPU.Climatology(period=period[1]-period[0], offset=offset, flush=False)
#     CPU.sync(flush=True)
    # load target grid definition (for regridding)
    if grid != 'GPCC':
      griddef = loadPickledGridDef(grid=grid, res=None, folder=grid_folder)
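    # --- illustrative sketch (not part of the processing chain) ---
    # Climatology file names are composed from a grid suffix and a period suffix;
    # the pattern below is an assumption for illustration (analogous to the
    # clim_file_pattern above), where an empty grid string denotes the native grid:
    _avgfile = 'gpcc{0:s}_clim{1:s}.nc' # hypothetical filename pattern
    assert _avgfile.format('', '_1979-1994') == 'gpcc_clim_1979-1994.nc' # native grid
    assert _avgfile.format('_cesm1x1', '_1979-1994') == 'gpcc_cesm1x1_clim_1979-1994.nc' # regridded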
def loadObservations(name=None, folder=None, period=None, grid=None, station=None, shape=None, lencl=False,
                     varlist=None, varatts=None, filepattern=None, filelist=None, resolution=None,
                     projection=None, geotransform=None, axes=None, lautoregrid=None, mode='climatology'):
  ''' A function to load standardized observational datasets. '''
  # prepare input
  if mode.lower() == 'climatology': # post-processed climatology files
    # transform period
    if period is None or period == '':
      if name not in ('PCIC','PRISM','GPCC','NARR'):
        raise ValueError("A period is required to load observational climatologies.")
    elif isinstance(period,basestring):
      period = tuple([int(prd) for prd in period.split('-')])
    elif not isinstance(period,(int,np.integer)) and not ( isinstance(period,tuple) and len(period) == 2 ):
      raise TypeError(period)
  elif mode.lower() in ('time-series','timeseries'): # concatenated time-series files
    period = None # to indicate time-series (but for safety, the input must be more explicit)
    if lautoregrid is None: lautoregrid = False # this can take very long!
  # cast/copy varlist
  if isinstance(varlist,basestring): varlist = [varlist] # cast as list
  elif varlist is not None: varlist = list(varlist) # make copy to avoid interference
  # figure out station and shape options
  if station and shape: raise ArgumentError()
  elif station or shape:
    if grid is not None: raise NotImplementedError('Currently observational station data can only be loaded from the native grid.')
    if lautoregrid: raise GDALError('Station data can not be regridded, since it is not map data.')
    lstation = bool(station); lshape = bool(shape)
    grid = station if lstation else shape
    # add station/shape parameters
    if varlist:
      params = stn_params if lstation else shp_params
      for param in params:
        if param not in varlist: varlist.append(param)
  else:
    lstation = False; lshape = False
  # varlist (varlist = None means all variables)
  if varatts is None: varatts = default_varatts.copy()
  if varlist is not None: varlist = translateVarNames(varlist, varatts)
  # filelist
  if filelist is None:
    filename = getFileName(name=name, resolution=resolution, period=period, grid=grid, filepattern=filepattern)
    # check existence
    filepath = '{:s}/{:s}'.format(folder,filename)
    if not os.path.exists(filepath):
      nativename = getFileName(name=name, resolution=resolution, period=period, grid=None, filepattern=filepattern)
      nativepath = '{:s}/{:s}'.format(folder,nativename)
      if os.path.exists(nativepath):
        if lautoregrid:
          from processing.regrid import performRegridding # causes circular reference if imported earlier
          griddef = loadPickledGridDef(grid=grid, res=None, folder=grid_folder)
          dataargs = dict(period=period, resolution=resolution)
          performRegridding(name, 'climatology', griddef, dataargs) # default kwargs
        else:
          raise IOError("The dataset '{:s}' for the selected grid ('{:s}') is not available - use the regrid module to generate it.".format(filename,grid))
      else:
        raise IOError("The dataset file '{:s}' does not exist!\n('{:s}')".format(filename,filepath))
    filelist = [filename] # default filelist
  # load dataset
  dataset = DatasetNetCDF(name=name, folder=folder, filelist=filelist, varlist=varlist, varatts=varatts,
                          axes=axes, multifile=False, ncformat='NETCDF4')
  # mask all shapes that are incomplete in dataset
  if shape and lencl and 'shp_encl' in dataset:
    dataset.load() # need to load data before masking; is cheap for shape averages, anyway
    dataset.mask(mask='shp_encl', invert=True, skiplist=shp_params)
  # correct ordinal number of shapes (should start at 1, not 0)
  if lshape:
    if dataset.hasAxis('shapes'): raise AxisError("Axis 'shapes' should be renamed to 'shape'!")
    if not dataset.hasAxis('shape'): raise AxisError()
    if dataset.shape.coord[0] == 0: dataset.shape.coord += 1
  # figure out grid
  if not lstation and not lshape:
    if grid is None or grid == name:
      dataset = addGDALtoDataset(dataset, projection=projection, geotransform=geotransform, gridfolder=grid_folder)
    elif isinstance(grid,basestring): # load grid definition from pickle file
#       griddef = loadPickledGridDef(grid=grid, res=None, filename=None, folder=grid_folder)
      # add GDAL functionality to dataset
      dataset = addGDALtoDataset(dataset, griddef=grid, gridfolder=grid_folder)
    else:
      raise TypeError(grid)
    # N.B.: projection should be auto-detected, if geographic (lat/lon)
  return dataset
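
# --- illustrative usage sketch (hypothetical values; never called) ---
# loadObservations() is the common backend behind the dataset-specific wrappers;
# a typical climatology call could look like this (folder, resolution, and file
# pattern are assumptions for illustration, not actual paths):
def _example_loadObservations():
  ''' Sketch: load a 1979-1994 GPCC climatology on its native grid. '''
  return loadObservations(name='GPCC', folder='/data/GPCC/gpccavg/', period=(1979,1994),
                          grid=None, resolution='25', varlist=['precip'],
                          filepattern='gpcc{0:s}_clim{1:s}.nc', mode='climatology')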
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None, lwrite=True,
                      lreturn=False, ldebug=False, lparallel=False, pidstr='', logger=None):
  ''' worker function to extract point data from gridded dataset '''
  # input checking
  if not isinstance(dataset,basestring): raise TypeError
  if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs
  if not callable(stnfct): raise TypeError # function to load station dataset
  if lparallel:
    if not lwrite: raise IOError, 'In parallel mode we can only write to disk (i.e. lwrite = True).'
    if lreturn: raise IOError, 'Can not return datasets in parallel mode (i.e. lreturn = False).'
  # logging
  if logger is None: # make new logger
    logger = logging.getLogger() # new logger
    logger.addHandler(logging.StreamHandler())
  else:
    if isinstance(logger,basestring):
      logger = logging.getLogger(name=logger) # connect to existing one
    elif not isinstance(logger,logging.Logger):
      raise TypeError, 'Expected logger ID/handle in logger KW; got {}'.format(str(logger))
  lclim = False; lts = False
  if mode == 'climatology': lclim = True
  elif mode == 'time-series': lts = True
  else: raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode)
  ## extract meta data from arguments
  module, dataargs, loadfct, filepath, datamsgstr = getMetaData(dataset, mode, dataargs)
  dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder
  # load template dataset
  stndata = stnfct() # load station dataset from function
  if not isinstance(stndata, Dataset): raise TypeError
  # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well
  # determine age of source file
  if not loverwrite: sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))
  # get filename for target dataset and do some checks
  filename = getTargetFile(stndata.name, dataset, mode, module, dataargs, lwrite)
  if ldebug: filename = 'test_' + filename
  if not os.path.exists(avgfolder): raise IOError, "Dataset folder '{:s}' does not exist!".format(avgfolder)
  lskip = False # else just go ahead
  if lwrite:
    if lreturn:
      tmpfilename = filename # no temporary file if dataset is passed on (can't rename the file while it is open!)
    else:
      if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
      else: tmppfx = 'tmp_exstns_'
      tmpfilename = tmppfx + filename
    filepath = avgfolder + filename
    tmpfilepath = avgfolder + tmpfilename
    if os.path.exists(filepath):
      if not loverwrite:
        age = datetime.fromtimestamp(os.path.getmtime(filepath))
        # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
        if age > sourceage and os.path.getsize(filepath) > 1e5: lskip = True
        # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a previous crash
      if not lskip: os.remove(filepath) # recompute
  # depending on last modification time of file or overwrite setting, start computation, or skip
  if lskip:
    # print message
    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr,filename,dataset_name)
    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,filepath)
    logger.info(skipmsg)
  else:
    ## actually load datasets
    source = loadfct() # load source
    # check period
    if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
      raise DateError, "Specified period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period)
    # print message
    if lclim: opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
    elif lts: opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
    else: raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode)
    # print feedback to logger
    logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr,datamsgstr,opmsgstr))
    if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')
    ## create new sink/target file
    # set attributes
    atts = source.atts.copy()
    atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
    atts['name'] = dataset_name; atts['station'] = stndata.name
    atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title,dataset_name,mode.title())
    # make new dataset
    if lwrite: # write to NetCDF file
      if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files
      sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
    else:
      sink = Dataset(atts=atts) # only create dataset in memory
    # initialize processing
    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
    # extract data at station locations
    CPU.Extract(template=stndata, flush=True)
    # get results
    CPU.sync(flush=True)
    # print dataset
    if not lparallel and ldebug: logger.info('\n'+str(sink)+'\n')
    # write results to file
    if lwrite:
      sink.sync()
      writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr,filename,dataset_name)
      writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,filepath)
      logger.info(writemsg)
      # rename file to proper name
      if not lreturn:
        sink.unload(); sink.close(); del sink # destroy all references
        if os.path.exists(filepath): os.remove(filepath) # remove old file
        os.rename(tmpfilepath,filepath)
        # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed
    # clean up and return
    source.unload(); del source #, CPU
    if lreturn:
      return sink # return dataset for further use (netcdf file still open!)
    else:
      return 0 # "exit code"
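
# --- illustrative sketch (not part of the module) ---
# performExtraction() writes to a temporary file and only renames it to the final
# name after a successful sync, so a crash never leaves a truncated file under
# the final name. A minimal, self-contained version of that pattern (the helper
# name '_write_then_rename' is ours):
import os # (already imported at module level)

def _write_then_rename(tmpfilepath, filepath, writefct):
  ''' Write output via 'writefct' to 'tmpfilepath', then move it to 'filepath'. '''
  if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove stale temp file
  writefct(tmpfilepath) # a crash here only affects the temp file
  if os.path.exists(filepath): os.remove(filepath) # remove old output file
  os.rename(tmpfilepath, filepath) # publish the completed file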
def computeClimatology(experiment, filetype, domain, periods=None, offset=0, griddef=None, varlist=None,
                       ldebug=False, loverwrite=False, lparallel=False, pidstr='', logger=None):
  ''' worker function to compute climatologies for given file parameters. '''
  # input type checks
  if not isinstance(experiment, Exp): raise TypeError
  if not isinstance(filetype, basestring): raise TypeError
  if not isinstance(domain, (np.integer, int)): raise TypeError
  if periods is not None and not (isinstance(periods, (tuple, list)) and isInt(periods)): raise TypeError
  if not isinstance(offset, (np.integer, int)): raise TypeError
  if not isinstance(loverwrite, (bool, np.bool)): raise TypeError
  if griddef is not None and not isinstance(griddef, GridDefinition): raise TypeError
  #if pidstr == '[proc01]': raise TypeError # to test error handling
  # load source
  dataset_name = experiment.name
  fileclass = fileclasses[filetype] # used for target file name
  tsfile = fileclass.tsfile.format(domain, '')
  expfolder = experiment.avgfolder
  filepath = '{:s}/{:s}'.format(expfolder, tsfile)
  logger.info('\n\n{0:s} *** Processing Experiment {1:<15s} *** '.format(pidstr, "'{:s}'".format(dataset_name)) +
              '\n{0:s} *** {1:^37s} *** \n'.format(pidstr, "'{:s}'".format(tsfile)))
  # check file and read begin/enddates
  if not os.path.exists(filepath):
    #raise IOError, "Source file '{:s}' does not exist!".format(filepath)
    # print message and skip
    skipmsg = "\n{:s} >>> File '{:s}' in dataset '{:s}' is missing --- skipping!".format(pidstr, tsfile, dataset_name)
    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
    logger.warning(skipmsg)
    # N.B.: this can cause a lot of error messages, when not all files are present
  else: # if monthly source file exists
    import netCDF4 as nc
    ncfile = nc.Dataset(filepath, mode='r')
    begintuple = ncfile.begin_date.split('-')
    endtuple = ncfile.end_date.split('-')
    ncfile.close()
    # N.B.: at this point we don't want to initialize a full GDAL-enabled dataset, since we don't even
    #       know if we need it, and it creates a lot of overhead
    # determine age of source file
    if not loverwrite: sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))
    # figure out start date
    filebegin = int(begintuple[0]) # first element is the year
    fileend = int(endtuple[0]) # first element is the year
    begindate = offset + filebegin
    if not (filebegin <= begindate <= fileend): raise DateError
    # handle cases where the first month in the record is not January
    firstmonth = int(begintuple[1]) # second element is the month
    shift = firstmonth - 1 # will be zero for January (01)
    ## loop over periods
    if periods is None: periods = [fileend - begindate] # average over the entire available record
#     periods.sort(reverse=True) # reverse, so that largest chunk is done first
    source = None # will later be assigned to the source dataset
    for period in periods:
      # figure out period
      enddate = begindate + period
      if filebegin > enddate: raise DateError, 'End date earlier than begin date.'
      if enddate-1 > fileend: # if filebegin is 1979 and the simulation is 10 years, fileend will be 1988, not 1989!
        # if end date is not available, skip period
        endmsg = "\n{:s} --- Invalid Period for '{:s}': End Date {:4d} not in File! --- \n".format(pidstr, dataset_name, enddate)
        endmsg += "{:s} --- ('{:s}')\n".format(pidstr, filepath)
        logger.info(endmsg)
      else:
        ## perform averaging for selected period
        # determine if sink file already exists, and what to do about it
        periodstr = '{0:4d}-{1:4d}'.format(begindate, enddate)
        gridstr = '' if griddef is None or griddef.name == 'WRF' else '_' + griddef.name
        filename = fileclass.climfile.format(domain, gridstr, '_' + periodstr)
        if ldebug: filename = 'test_' + filename
        if lparallel: tmppfx = 'tmp_wrfavg_{:s}_'.format(pidstr[1:-1])
        else: tmppfx = 'tmp_wrfavg_'
        tmpfilename = tmppfx + filename
        assert os.path.exists(expfolder)
        filepath = expfolder + filename
        tmpfilepath = expfolder + tmpfilename
        lskip = False # else just go ahead
        if os.path.exists(filepath):
          if not loverwrite:
            age = datetime.fromtimestamp(os.path.getmtime(filepath))
            # if sink file is newer than source file, skip (do not recompute)
            if age > sourceage and os.path.getsize(filepath) > 1e6: lskip = True
            # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
            #print sourceage, age
          if not lskip: os.remove(filepath)
        # depending on last modification time of file or overwrite setting, start computation, or skip
        if lskip:
          # print message
          skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
          skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
          logger.info(skipmsg)
        else:
          lregrid = griddef is not None # whether to reproject onto a new grid
          ## begin actual computation
          beginmsg = "\n{:s} <<< Computing '{:s}' (d{:02d}) Climatology from {:s}".format(pidstr, dataset_name, domain, periodstr)
          if not lregrid: beginmsg += " >>> \n"
          else: beginmsg += " ('{:s}' grid) >>> \n".format(griddef.name)
          logger.info(beginmsg)
          ## actually load datasets
          if source is None:
            source = loadWRF_TS(experiment=experiment, filetypes=[filetype], domains=domain) # comes out as a tuple...
          if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')
          # prepare sink
          if os.path.exists(tmpfilepath): os.remove(tmpfilepath) # remove old temp files
          sink = DatasetNetCDF(name='WRF Climatology', folder=expfolder, filelist=[tmpfilename],
                               atts=source.atts.copy(), mode='w')
          sink.atts.period = periodstr
#           if lregrid: addGDALtoDataset(sink, griddef=griddef)
          # initialize processing
          CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=lregrid, feedback=ldebug) # no need for lat/lon
          # start processing climatology
          if shift != 0:
            logger.info('{0:s} (shifting climatology by {1:d} month, to start with January) \n'.format(pidstr, shift))
          CPU.Climatology(period=period, offset=offset, shift=shift, flush=False)
          # N.B.: immediate flushing should not be necessary for climatologies, since they are much smaller!
          # reproject and resample (regrid) dataset
          if lregrid:
            CPU.Regrid(griddef=griddef, flush=True)
            logger.info('{:s} --- {:s} --- \n'.format(pidstr, griddef.name))
            logger.debug('{:s} --- {:s} --- \n'.format(pidstr, str(griddef)))
          # sync temporary storage with output dataset (sink)
          CPU.sync(flush=True)
          # add Geopotential Height Variance
          if 'GHT_Var' in sink and 'Z_var' not in sink:
            data_array = (sink['GHT_Var'].data_array - sink['Z'].data_array**2)**0.5
            atts = dict(name='Z_var', units='m', long_name='Square Root of Geopotential Height Variance')
            sink += Variable(axes=sink['Z'].axes, data=data_array, atts=atts)
          # add (relative) Vorticity Variance
          if 'Vorticity_Var' in sink and 'zeta_var' not in sink:
            data_array = (sink['Vorticity_Var'].data_array - sink['zeta'].data_array**2)**0.5
            atts = dict(name='zeta_var', units='1/s', long_name='Square Root of Relative Vorticity Variance')
            sink += Variable(axes=sink['zeta'].axes, data=data_array, atts=atts)
          # add names and length of months
          sink.axisAnnotation('name_of_month', name_of_month, 'time',
                              atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
          if not sink.hasVariable('length_of_month'):
            sink += Variable(name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
                             atts=dict(name='length_of_month', units='days', long_name='Length of Month'))
          # close... and write results to file
          sink.sync()
          sink.close()
          writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
          writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
          logger.info(writemsg)
          # rename file to proper name
          if os.path.exists(filepath): os.remove(filepath) # remove old file
          os.rename(tmpfilepath, filepath) # this will overwrite the old file
          # print dataset
          if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
          # clean up (not sure if this is necessary, but there seems to be a memory leak...)
          del sink, CPU
          gc.collect() # get rid of these guys immediately
    # clean up
    if source is not None:
      source.unload(); del source
    # N.B.: source is only loaded once for all periods
    # N.B.: garbage is collected in multi-processing wrapper as well
  # return: so far there is no measure of success; only a crash indicates failure
  return 0
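
# --- illustrative sketch (not part of the module) ---
# The skip logic in computeClimatology() recomputes a climatology only if the
# existing sink file is older than its source or suspiciously small (incomplete
# header fragments from a crash). A condensed version of that check (the helper
# name '_needs_recompute' is ours; the 1MB threshold follows the code above):
import os # (already imported at module level)
from datetime import datetime

def _needs_recompute(sourcepath, sinkpath, minsize=1e6):
  ''' Return True if 'sinkpath' is missing, older than 'sourcepath', or too small. '''
  if not os.path.exists(sinkpath): return True # nothing there yet
  sourceage = datetime.fromtimestamp(os.path.getmtime(sourcepath))
  sinkage = datetime.fromtimestamp(os.path.getmtime(sinkpath))
  return sinkage <= sourceage or os.path.getsize(sinkpath) <= minsize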