def cdrop(obj, rm=True, force=False):
    """ Deletes the cached file for a CliMAF object, if it exists

    Args:
     obj (cobject or string) : object to delete, or its string representation (CRS)
     force (bool) : should we delete the object even if it is 'protected'
     rm (bool) : for advanced use only; should we actually delete (rm) the file,
       or just forget it in CliMAF cache index

    Returns: None if object does not exists, False if failing to delete, True if OK

    Example ::

    >>> dg=ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
    >>> f=cfile(dg)
    >>> os.system('ls -al '+f)
    >>> cdrop(dg)
    """
    global crs2filename
    global dropped_crs

    if isinstance(obj, cobject):
        # repr() is equivalent to the Python2 backquote syntax used elsewhere
        crs = repr(obj)
        # Datasets are registered in cache under their 'select(...)' expression
        if isinstance(obj, cdataset):
            crs = "select(" + crs + ")"
    elif type(obj) is str:
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    if crs not in crs2filename:
        clogger.info("%s is not cached" % crs)
        return None
    clogger.info("Discarding cached value for %s (expect if protected)" % crs)
    fil = crs2filename[crs]
    if not rm:
        # Fix: with rm=False the docstring promises to 'just forget' the entry,
        # but the original code did nothing at all in that case
        crs2filename.pop(crs)
        dropped_crs.append(crs)
        return True
    try:
        if force:
            # Lift write-protection before testing it
            os.system("chmod +w " + fil)
        if not os.access(fil, os.W_OK):
            clogger.info("Object %s is protected" % crs)
            return
        path_file = os.path.dirname(fil)
        os.remove(fil)
        crs2filename.pop(crs)
        dropped_crs.append(crs)
        try:
            # Tidy up : remove the containing directory if it became empty
            os.rmdir(path_file)
        except OSError as ex:
            clogger.warning(ex)
        return True
    except OSError:
        # Narrowed from a bare 'except:' which also swallowed e.g. KeyboardInterrupt
        clogger.warning(
            "When trying to remove %s : file does not exist in cache" % crs)
        return False
def set_variable(obj, varname, format) :
    """ Change to VARNAME the variable name for OBJ, which FORMAT maybe 'file' or 'MaskedArray'.
    Also set the variable long_name using CF convention (TBD)

    Returns True on success, None on failure or when OBJ is None.
    """
    if obj is None : return None
    long_name=CFlongname(varname)
    if (format=='file') :
        oldvarname=varOfFile(obj)
        if (oldvarname != varname) :
            # Rename the variable in place using NCO's ncrename
            command="ncrename -v %s,%s %s >/dev/null 2>&1"%(oldvarname,varname,obj)
            if ( os.system(command) != 0 ) :
                clogger.error("Issue with changing varname to %s in %s"%(varname,obj))
                return None
            clogger.debug("Varname changed to %s in %s"%(varname,obj))
            # Set the long_name attribute using NCO's ncatted
            command="ncatted -a long_name,%s,o,c,%s %s"%(varname,long_name,obj)
            if ( os.system(command) != 0 ) :
                clogger.error("Issue with changing long_name for var %s in %s"% (varname,obj))
                return None
            return True
        # NOTE(review): when the file already bears the requested variable name,
        # nothing is returned (implicitly None) -- confirm this is intended
    elif (format=='MaskedArray') :
        clogger.warning('TBD - Cannot yet set the varname for MaskedArray')
    else :
        clogger.error('Cannot handle format %s'%format)
def csync(update=False) :
    """
    Write cache dictionary to disk

    If arg `update` is True, first updates dictionary from actual cache file content
    """
    import pickle
    global cacheIndexFileName

    # check if cache index is up to date; if not the
    # function 'rebuild' is called
    if update :
        clogger.warning("Listing crs from files present in cache")
        # NOTE(review): list_cache() output is compared to crs2filename.values()
        # (filenames); the sibling csync() implementation also compares
        # filenames -- confirm both sides hold the same kind of items here
        crs_in_cache=list_cache()
        crs_in_cache.sort()
        crs_in_index=crs2filename.values()
        crs_in_index.sort()
        if crs_in_index != crs_in_cache:
            clogger.warning("Rebuilding cache index")
            rebuild()
    # Save to disk (Python2 'file' builtin; index is pickled as a whole)
    cacheIndexFile=file(os.path.expanduser(cacheIndexFileName), "w")
    pickle.dump(crs2filename,cacheIndexFile)
    cacheIndexFile.close()
def generateUniqueFileName_safe(expression, operator=None, format="nc"):
    """ Generate a filename path from string EXPRESSION and file FORMAT, unique for the
    expression and the set of cache directories currently listed in cache.cachedirs
    OPERATOR may be a function that provides a prefix, using EXPRESSION

    This uses hashlib.sha224, which are truncated to 3 (or more) characters.
    More characters are used if a shorter name is already in use for another
    expression in one of the known cache directories

    Generated names drive a structure where each directory name 1 or 2
    characters and file names have no more characters

    Raises if uniqueness is unachievable (quite unexpectable !)
    """
    if format is None:
        return ""
    prefix = ""
    if operator is not None:
        prefix2 = operator(expression)
        if prefix2 is not None:
            prefix = prefix2 + "/"
    full = hashlib.sha224(expression).hexdigest()
    number = fileNameLength
    guess = full[0:number - 1]
    existing = searchFile(prefix + stringToPath(guess, directoryNameLength) + "." + format)
    if existing:
        readCRS = getCRS(existing)
        # Update index if needed
        if readCRS not in crs2filename:
            clogger.warning(
                "existing data %s in file %s was not yet registered in cache index" %
                (readCRS, existing))
            crs2filename[readCRS] = existing
    # Lengthen the hash prefix until it is free or matches our expression
    while (existing is not None) and (readCRS != expression):
        clogger.debug("must skip %s which CRS is %s" % (existing, getCRS(existing)))
        number += 2
        if number >= len(full):
            # Fix: the original concatenated an int into the message (TypeError)
            # and used a bare 'exit' statement, which is a no-op
            message = ("Critical issue in cache : %d digits is not enough for %s" %
                       (len(full), expression))
            clogger.critical(message)
            raise RuntimeError(message)
        guess = full[0:number - 1]
        existing = searchFile(prefix + stringToPath(guess, directoryNameLength) + "." + format)
        if existing:
            readCRS = getCRS(existing)
    rep = (currentCache + "/" + prefix +
           stringToPath(full[0:number - 1], directoryNameLength) + "." + format)
    rep = os.path.expanduser(rep)
    # Create the relevant directory, so that user scripts don't have to care
    dirn = os.path.dirname(rep)
    if not os.path.exists(dirn):
        os.makedirs(dirn)
    clogger.debug("returning %s" % rep)
    return rep
def csync(update=False):
    """
    Merges current in-memory cache index and current on-file cache index
    for updating both

    If arg `update` is True, additionally ensures consistency between files
    set and index content, either :

    - if cache.stamping is true, by reading CRS in all files
    - else, by removing files which are not in the index; this may erase
      result files which have been computed by another running
      instance of CliMAF
    """
    global cacheIndexFileName
    global dropped_crs

    # Merge index on file and index in memory
    file_index = cload(True)
    for crs in dropped_crs:
        # Entries dropped in memory must not be resurrected from disk
        file_index.pop(crs, None)
    crs2filename.update(file_index)

    # check if cache index is up to date; if not enforce consistency
    if update:
        clogger.info("Listing crs from files present in cache")
        files_in_cache = list_cache()
        files_in_cache.sort()
        files_in_index = crs2filename.values()
        files_in_index.sort()
        if files_in_index != files_in_cache:
            if stamping:
                clogger.info("Rebuilding cache index from file content")
                rebuild()
            else:
                clogger.warning(
                    'In no stamp mode, there is no way to seriously identify CRS from files in cache !')
                # clogger.warning('Removing cache files which content is not known.
                # This is an issue in concurrent mode !')
                # for fil in files_in_cache :
                #     if fil not in files_in_index :
                #         os.system("rm %"%fil)
                # else :
                # Should also remove empty files, as soon as
                # file creation will be atomic enough
    # Save index to disk
    fn = os.path.expanduser(cacheIndexFileName)
    try:
        with open(fn, "w") as cacheIndexFile:
            pickle.dump(crs2filename, cacheIndexFile)
        dropped_crs = []
    except Exception:
        # Fix: original tested 'len(files_in_cache > 0)' -- a list-vs-int
        # comparison followed by len() on a bool (TypeError); also narrowed
        # the bare 'except:'
        if update and os.path.isfile(fn) and len(files_in_cache) > 0:
            clogger.error("Issue when writing cache index %s" % fn)
def cimport(cobject, crs):
    """ Import an external data object COBJECT in CliMAF cache, registered
    under CliMAF Reference Syntax expression CRS.

    COBJECT may be a filename (registered as-is in the cache index) or a
    numpy masked array (not yet implemented).
    """
    # Fix: the original passed cobject as a second positional argument to the
    # logger with no placeholder in the message, which makes the logging module
    # fail to format the record
    clogger.debug("cimport called with argument %s" % repr(cobject))
    clogger.debug("should check syntax of arg 'crs' -TBD")
    clogger.warning("cimport is not for the dummies - Playing at your own risks !")
    import numpy
    import numpy.ma
    if isinstance(cobject, numpy.ma.MaskedArray):
        clogger.debug("for now, use a file for importing - should revisit - TBD")
        # Fix: typo 'fro' -> 'for'
        clogger.error("not yet implemented for Masked Arrays - TBD")
    elif isinstance(cobject, str):
        cache.register(cobject, crs)
    else:
        clogger.error("argument is not a Masked Array nor a filename : %s" % repr(cobject))
def generateUniqueFileName(expression, operator=None, format="nc"):
    """ Generate a filename path from string EXPRESSION and file FORMAT, unique for the
    expression and the set of cache directories currently listed in cache.cachedirs
    OPERATOR may be a function that provides a prefix, using EXPRESSION

    This uses hashlib.sha224, which are truncated to 3 (or more) characters.
    More characters are used if a shorter name is already in use for another
    expression in one of the known cache directories

    Generated names drive a structure where each directory name 1 or 2
    characters and file names have no more characters

    Raises if uniqueness is unachievable (quite unexpectable !)
    """
    import hashlib
    directoryNameLength = 2
    if format is None:
        return ""
    prefix = ""
    if operator is not None:
        prefix2 = operator(expression)
        if prefix2 is not None:
            prefix = prefix2 + "/"
    full = hashlib.sha224(expression).hexdigest()
    number = 4
    guess = full[0:number - 1]
    existing = searchFile(prefix + stringToPath(guess, directoryNameLength) + "." + format)
    if existing:
        readCRS = getCRS(existing)
        # Update index if needed
        if readCRS not in crs2filename:
            clogger.warning(
                "existing data %s in file %s was not yet registered in cache index" %
                (readCRS, existing))
            crs2filename[readCRS] = existing
    # Lengthen the hash prefix until it is free or matches our expression
    while (existing is not None) and (readCRS != expression):
        clogger.debug("must skip %s which CRS is %s" % (existing, getCRS(existing)))
        number += 2
        if number >= len(full):
            # Fix: the original concatenated an int into the message (TypeError)
            # and used a bare 'exit' statement, which is a no-op
            message = ("Critical issue in cache : %d digits is not enough for %s" %
                       (len(full), expression))
            clogger.critical(message)
            raise RuntimeError(message)
        guess = full[0:number - 1]
        existing = searchFile(prefix + stringToPath(guess, directoryNameLength) + "." + format)
        if existing:
            readCRS = getCRS(existing)
    rep = (currentCache + "/" + prefix +
           stringToPath(full[0:number - 1], directoryNameLength) + "." + format)
    rep = os.path.expanduser(rep)
    # Create the relevant directory, so that user scripts don't have to care
    dirn = os.path.dirname(rep)
    if not os.path.exists(dirn):
        os.makedirs(dirn)
    clogger.debug("returning %s" % rep)
    return rep
def efile(obj, filename, forced=False):
    """
    Create a single file for an ensemble of CliMAF objects (launch computation if needed).

    This is a convenience function. Such files are not handled in CliMAF cache

    Args:
      obj (CliMAF object) : an ensemble of CliMAF objects ('cens' object)
      filename (str) : output filename. It will include a field for each
       ensemble's member, with a variable name suffixed by the member
       label (e.g. : tas_CNRM-CM, tas_IPSL-CM... ) (more formally :
       'var(obj.members[n])'_'obj.labels[n]')
      forced (logical, optional) : if True, CliMAF will override the file
       'filename' if it already exists
    """
    if not isinstance(obj, classes.cens):
        # Fix: typo 'objet' -> 'object'
        clogger.warning("object is not a 'cens' object")
        return
    if os.path.isfile(filename):
        if forced:
            # Fix: os.remove is enough for a plain file (was 'rm -rf'),
            # and the warning now states what actually happened
            os.remove(filename)
            clogger.warning("File '%s' already existed and was overwritten" % filename)
        else:
            raise Climaf_Driver_Error(
                "File '%s' already exists: use 'forced=True' to override it" % filename)
    for memb, lab in zip(obj.members, obj.labels):
        ffile = cfile(memb)
        # Rename the member's variable to '<var>_<label>' into a temporary file
        f = tempfile.NamedTemporaryFile(suffix=".nc")
        command = "ncrename -O -v %s,%s_%s %s %s" % (varOf(memb), varOf(memb), lab, ffile, f.name)
        if os.system(command) != 0:
            raise Climaf_Driver_Error("ncrename failed : %s" % command)
        # Append the renamed field to the output file
        command2 = "ncks -A %s %s" % (f.name, filename)
        if os.system(command2) != 0:
            raise Climaf_Driver_Error(
                "Issue when merging %s and %s (using command: %s)" % (f.name, filename, command2))
        f.close()
def cdrop(obj, rm=True):
    """ Deletes the cached file for a CliMAF object, if it exists

    Args:
     obj (cobject or string) : object to delete, or its string representation (CRS)
     rm (bool) : for advanced use only; should we actually delete (rm) the file,
       or just forget it in CliMAF cache index

    Returns: None if object does not exists, False if failing to delete, True if OK

    Example ::

    >>> dg=ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
    >>> f=cfile(dg)
    >>> os.system('ls -al '+f)
    >>> cdrop(dg)
    """
    global crs2filename

    # Normalize the argument to a CRS string
    if isinstance(obj, cobject):
        crs = repr(obj)  # same as Python2 backquotes
        if isinstance(obj, cdataset):
            crs = "select(%s)" % crs
    elif type(obj) is str:
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    # Unknown CRS : nothing to do
    if crs not in crs2filename:
        clogger.info("%s is not cached" % crs)
        return None
    clogger.info("discarding cached value for " + crs)
    cached_file = crs2filename.pop(crs)
    if not rm:
        return
    try:
        os.remove(cached_file)
        return True
    except:
        clogger.warning("When trying to remove %s : file does not exist in cache" % crs)
        return False
def rebuild():
    """ Rebuild the in-memory content of CliMAF cache index

    Scans the cached files and re-registers each one under the CRS read back
    by getCRS(); files with no readable CRS are deleted. Only possible in
    'stamping' mode.
    """
    global crs2filename

    if not stamping:
        clogger.warning("Cannot rebuild cache index, because we are not in 'stamping' mode")
        return None
    crs2filename.clear()
    for cached_file in list_cache():
        crs = getCRS(cached_file)
        if not crs:
            # No CRS can be recovered from this file : discard it
            os.system('rm -f ' + cached_file)
            clogger.warning("File %s is removed" % cached_file)
        else:
            crs2filename[crs] = cached_file
    return crs2filename
def selectGenericFiles(urls, **kwargs):
    """
    Allow to describe a ``generic`` file organization : the list of files returned
    by this function is composed of files which :

    - match the patterns in ``url`` once these patterns are instantiated by the values in kwargs, and

    - contain the ``variable`` provided in kwargs

    - match the ``period`` provided in kwargs

    In the pattern strings, no keyword is mandatory

    Example :

    >>> selectGenericFiles(project='my_projet',model='my_model', simulation='lastexp', variable='tas', period='1980', urls=['~/DATA/${project}/${model}/*${variable}*YYYY*.nc)'])
    /home/stephane/DATA/my_project/my_model/somefilewith_tas_Y1980.nc

    In the pattern strings, the keywords that can be used in addition to the argument
    names (e.g. ${model}) are:

    - ${variable} : use it if the files are split by variable and filenames do include
      the variable name, as this speed up the search

    - YYYY, YYYYMM, YYYYMMDD : use it for indicating the start date of the period covered
      by each file, if this is applicable in the file naming; use a second time for end
      date, if applicable (otherwise the assumption is that the whole year -resp. month
      or day- is included in the file

    - wildcards '?' and '*' for matching respectively one and any number of characters
    """
    rep = []
    period = kwargs['period']
    if type(period) is str:
        period = init_period(period)
    variable = kwargs['variable']
    mustHaveVariable = False
    if "filenameVar" in kwargs and kwargs['filenameVar']:
        kwargs['variable'] = kwargs['filenameVar']
        mustHaveVariable = True
    for l in urls:
        template = Template(l)
        # There is no use to look for files which path is not specific
        # to the required variable when we know it should
        if l.find("${variable}") < 0 and mustHaveVariable:
            continue
        #
        # Instantiate keywords in pattern with attributes values
        template = template.safe_substitute(**kwargs)
        #
        # Construct a pattern for globbing dates
        temp2 = template
        dt = dict(YYYY="????", YYYYMM="??????", YYYYMMDD="????????")
        for k in dt:
            temp2 = temp2.replace(k, dt[k])
        clogger.debug("Globbing on : " + temp2)
        lfiles = glob.glob(temp2)
        #
        # Analyze all filenames
        for f in lfiles:
            # Construct regexp for extracting dates from filename
            # Fix: YYYYMMDD is an 8-digit date; the original expected 10 digits
            dt = dict(YYYY="([0-9]{4})", YYYYMM="([0-9]{6})",
                      YYYYMMDD="([0-9]{8})")
            regexp = None
            # Try the longest keys first, so that e.g. YYYY does not shadow YYYYMMDD
            lkeys = dt.keys()
            lkeys.sort(reverse=True)
            for key in lkeys:
                start = template.find(key)
                if start >= 0:
                    # First occurrence of the key marks the file's start date
                    regexp = template.replace(key, dt[key], 1)
                    hasEnd = False
                    start = regexp.find(key)
                    if start >= 0:
                        # A second occurrence marks the end date
                        hasEnd = True
                        regexp = regexp.replace(key, dt[key], 1)
                    break
            #
            # Analyze file time period
            fperiod = None
            if regexp:
                regexp = regexp.replace("*", ".*").replace("?", r".")
                start = re.sub(regexp, r'\1', f)
                if hasEnd:
                    end = re.sub(regexp, r'\2', f)
                    fperiod = init_period("%s-%s" % (start, end))
                else:
                    fperiod = init_period(start)
            else:
                # No date keyword in the pattern : cannot filter on time from filename
                if 'frequency' in kwargs and kwargs['frequency'] == "fx":
                    if (l.find("${variable}") >= 0) or fileHasVar(f, variable):
                        clogger.debug("adding fixed field :" + f)
                        rep.append(f)
                else:
                    clogger.warning("Cannot yet filter files re. time using only file content. TBD")
                    rep.append(f)
            #
            # Filter file time period against required period
            if (fperiod and period.intersects(fperiod)) or not regexp:
                # Filter against variable
                if (l.find("${variable}") >= 0) or fileHasVar(f, variable):
                    # Should check time period in the file if not regexp
                    rep.append(f)
    return rep
def __init__(self, name, *args, **kwargs):
    """ Declare a project named NAME and its facets/attributes in CliMAF.

    Args:
        name (string) : project name; do not use the chosen separator in it
        args (strings) : facet names, freely chosen (avoid the separator);
          CliMAF automatically adds facets : project, simulation, variable,
          period and domain. A (facet_name, default_value) tuple may be given
          instead of a plain name to set a default value (of lower priority
          than a value set with :py:func:`cdef`)
        kwargs (dict) : only these keywords :

          - ``sep`` or ``separator`` : symbol separating facets in the
            dataset syntax; defaults to "."
          - ``ensemble`` : list of attribute names allowed for defining an
            ensemble in this project ('simulation' is automatically allowed)

    Returns : a cproject object, which string representation is the pattern
    later used in CliMAF Reference Syntax for datasets of this project, e.g.::

        ${project}_${simulation}_${variable}_${period}_${domain}_${myfreq}_${myfacet}

    See also :py:func:`~climaf.classes.calias` for non-standard variable
    names and :py:func:`~climaf.classes.cfreqs` for non-standard frequency
    names, and :py:class:`~climaf.dataloc.dataloc` for data locations.
    """
    if name in cprojects:
        clogger.warning("Redefining project %s" % name)
    self.project = name
    #
    self.facets = []
    self.facet_defaults = dict()
    mandatory = ['project', 'simulation', 'variable', 'period', 'domain']
    self.facets.extend(mandatory)
    for arg in args:
        # A tuple carries a facet together with its default value
        if isinstance(arg, tuple):
            facet, default = arg
            self.facet_defaults[facet] = default
        else:
            facet = arg
        if facet not in mandatory:
            self.facets.append(facet)
    #
    self.separator = "."
    if "separator" in kwargs:
        self.separator = kwargs['separator']
    if "sep" in kwargs:
        self.separator = kwargs['sep']
    cprojects[name] = self
    # Build the pattern for the datasets CRS of this cproject
    self.crs = self.separator.join("${%s}" % facet for facet in self.facets)
    # Facets which are allowed for defining an ensemble;
    # 'simulation' is always allowed
    self.attributes_for_ensemble = ['simulation']
    if 'ensemble' in kwargs:
        self.attributes_for_ensemble.extend(kwargs["ensemble"])
def clim_average_fast(dat, season):
    """
    Computes climatological averages on the annual cycle of a dataset, on the
    months specified with 'season', either:

    - the annual mean climatology (season in 'ann', 'annual', 'climato',
      'clim', 'climatology', 'annual_average', 'anm', 'annual_mean')
    - seasonal climatologies (e.g. season='DJF' for December-January-February;
      available : DJF, DJFM, MAM, JJA, SON, JFM, JAS, JJAS, NDJ, AMJ)
    - individual monthly climatologies (e.g. season='january', 'jan', '1' or 1)
    - annual maximum or minimum (season in 'max'/'min' variants; typically
      makes sense with the mixed layer depth)

    Seasons and months may be given in upper or lower case; the annual cycle
    is computed for you.

    >>> climds_JFM = clim_average(dat,'JFM')
    >>> climds_ANM = clim_average(dat,'annual_mean')
    >>> climds_September = clim_average(dat,'September')
    >>> climds_September = clim_average(dat,9)   # Same as previous, numeric
    """
    key = str(season).lower()
    if key in ('ann', 'annual', 'climato', 'clim', 'climatology',
               'annual_average', 'anm', 'annual_mean'):
        avg = time_average_fast(dat)
    else:
        # -- Compute the annual cycle
        scyc = annual_cycle_fast(dat)
        #
        # -- Season -> month-numbers table (atmospheric, oceanic and
        # biogeochemistry classics)
        season_table = dict(DJF='1,2,12', DJFM='1,2,3,12', MAM='3,4,5',
                            JJA='6,7,8', SON='9,10,11', JFM='1,2,3',
                            JAS='7,8,9', JJAS='6,7,8,9', NDJ='11,12,1',
                            AMJ='4,5,6')
        if str(season).upper() == 'DJF':
            clogger.warning('DJF is actually processed as JF....D. Maybe an issue for short periods !')
        selmonths = season_table.get(str(season).upper())
        if selmonths:
            avg = ccdo_fast(scyc, operator='timmean -seltimestep,' + selmonths)
            # avg = ccdo(scyc,operator='timmean -selmon,'+selmonths)
        #
        # -- Individual months : accept full name, abbreviation or number
        month_table = dict(january='1', jan='1', february='2', feb='2',
                           march='3', mar='3', april='4', apr='4', may='5',
                           june='6', jun='6', july='7', jul='7', august='8',
                           aug='8', september='9', sep='9', october='10',
                           oct='10', november='11', nov='11', december='12',
                           dec='12')
        for m in range(1, 13):
            month_table[str(m)] = str(m)
        selmonth = month_table.get(key)
        if selmonth:
            avg = ccdo_fast(scyc, operator='selmon,' + selmonth)
        #
        # -- Annual Maximum
        if key in ('max', 'annual max', 'annual_max'):
            avg = ccdo_fast(scyc, operator='timmax')
        #
        # -- Annual Minimum
        if key in ('min', 'annual min', 'annual_min'):
            avg = ccdo_fast(scyc, operator='timmin')
    return avg
def selectLocalFiles(**kwargs):
    """
    Returns the shortest list of (local) files which include the data
    for the list of (facet,value) pairs provided

    Method :

    - use datalocations indexed by :py:func:`~climaf.dataloc.dataloc` to
      identify data organization and data store urls for these (facet,value)
      pairs

    - check that data organization is a known one, i.e. is one of 'generic',
      'CMIP5_DRS' or 'EM'

    - derive relevant filenames search function such as
      :py:func:`~climaf.dataloc.selectCmip5DrsFiles` from data organization
      scheme

    - pass urls and relevant facet values to this filenames search function
    """
    rep = []
    project = kwargs['project']
    simulation = kwargs['simulation']
    variable = kwargs['variable']
    period = kwargs['period']
    model = kwargs.get('model', "*")
    frequency = kwargs.get('frequency', "*")
    ofu = getlocs(project=project, model=model, simulation=simulation, frequency=frequency)
    clogger.debug("locs=" + repr(ofu))
    if len(ofu) == 0:
        clogger.warning("no datalocation found for %s %s %s %s " %
                        (project, model, simulation, frequency))
    for org, freq, urls in ofu:
        kwargs2 = kwargs.copy()
        # Convert normalized frequency to project-specific frequency if applicable
        if "frequency" in kwargs and project in classes.frequencies:
            normfreq = kwargs2['frequency']
            if normfreq in classes.frequencies[project]:
                kwargs2['frequency'] = classes.frequencies[project][normfreq]
        #
        # Call organization-specific routine
        if org == "EM":
            rep.extend(selectEmFiles(**kwargs2))
        elif org == "CMIP5_DRS":
            rep.extend(selectCmip5DrsFiles(urls, **kwargs2))
        elif org == "generic":
            rep.extend(selectGenericFiles(urls, **kwargs2))
        else:
            raise Climaf_Data_Error("cannot process organization " + org +
                                    " for simulation " + simulation + " and model " + model +
                                    " of project " + project)
    if not ofu:
        return None
    if len(rep) == 0:
        clogger.warning("no file found for %s, at these "
                        "data locations %s " % (repr(kwargs), repr(urls)))
        return None
    # Discard duplicates (assumes that sorting is harmless for later processing)
    # Fix: the original removed items from 'rep' while iterating over it, which
    # skips elements and can leave duplicates when a file occurs more than twice
    rep.sort()
    unique = []
    last = None
    for f in rep:
        if f != last:
            unique.append(f)
            last = f
    # Assemble filenames in one single (space-separated) string
    return string.join(unique)
def __init__(self,name, command, format="nc", canOpendap=False, commuteWithTimeConcatenation=False, commuteWithSpaceConcatenation=False, **kwargs): """ Declare a script or binary as a 'CliMAF operator', and define a Python function with the same name Args: name (str): name for the CliMAF operator. command (str): script calling sequence, according to the syntax described below. format (str): script outputs format -- either 'nc' or 'png' or 'None'; defaults to 'nc' canOpendap (bool, optional): is the script able to use OpenDAP URIs ? default to False commuteWithTimeConcatenation (bool, optional): can the operation commute with concatenation of time periods ? set it to true, if the operator can be applied on time chunks separately, in order to allow for incremental computation / time chunking; defaults to False commuteWithSpaceConcatenation (bool, optional): can the operation commute with concatenation of space domains ? defaults to False (see commuteWithTimeConcatenation) **kwargs : possible keyword arguments, with keys matching '<outname>_var', for providing a format string allowing to compute the variable name for output 'outname' (see below). 
Returns: None The script calling sequence pattern string (arg 'command') indicates how to build the system call which actually launches the script, with a match between python objects and formal arguments; For introducing the syntax, please consider this example, with the following commands:: >>> cscript('mycdo','cdo ${operator} ${in} ${out}') >>> # define some dataset >>> tas_ds = ds(project='example', simulation='AMIPV6', variable='tas', period='1980-1981') >>> # Apply operator 'mycdo' to dataset 'tas_ds', choosing a given 'operator' argument >>> tas_avg = mycdo(tas_ds,operator='timavg') CliMAF will later on launch this call behind the curtain:: $ cdo tim_avg /home/my/tmp/climaf_cache/8a/5.nc /home/my/tmp/climaf_cache/4e/4.nc where : - the last filename is generated by CliMAF from the formal exprerssion describing 'tas_avg' - the first filename provide a file generated by CliMAF which includes the required data fot tas_ds There are a number of examples in module :download:`standard_operators <../climaf/standard_operators.py>`. 
**Detailed syntax**: - formal arguments appear as : ``${argument}`` (in the example : ``${in}``, ``${out}``, ``${operator}`` ) - except for reserved keywords, arguments in the pattern will be replaced by the values for corresponding keywords used when invoking the diagnostic operator: - in the example above : argument ``operator`` is replaced by value ``timavg``, which is a keyword known to the external binary called, CDO - reserved argument keywords are : - **in, in_<digit>, ins, ins_<digit>, mmin** : they will be replaced by CliMAF managed filenames for input data, as deduced from dataset description or upstream computation; these filenames can actually be remote URLs (if the script can use OpenDAP, see args), local 'raw' data files, or CliMAF cache filenames - **in** stands for the URL of the first dataset invoked in the operator call - **in_<digit>** stands for the next ones, in the same order - **ins** and **ins_<digit>** stand for the case where the script can select input from multiple input files or URLs (e.g. when the whole period to process spans over multiple files); in that case, a single string (surrounded with double quotes) will carry multiple URLs - **mmin** stands for the case where the script accepts an ensemble of datasets (only for first input stream yet). 
CliMAF will replace the keyword by a string composed of the corresponding input filenames (not surrounded by quotes - please add them yourself in declaration); see also ``labels`` below - **var, var_<digit>** : when a script can select a variable in a multi-variable input stream, this is declared by adding this keyword in the calling sequence; CliMAF will replace it by the actual variable name to process; 'var' stands for first input stream, 'var_<digit>' for the next ones; - in the example above, we assume that external binary CDO is not tasked with selecting the variable, and that CliMAF must feed CDO with a datafile where it has already performed the selection - **period, period_<digit>** : when a script can select a time period in the content of a file or stream, it should declare it by putting this keyword in the pattern, which will be replaced at call time by the period written as <date1>-<date2>, where date is formated as YYYYMMDD ; - time intervals must be interpreted as [date1, date2[ - 'period' stands for the first input_stream, - 'period_<n>' for the next ones, in the order of actual call; - in the example above, this keyword is not used, which means that CliMAF has to select the period upstream of feeding CDO with the data - **period_iso, period_iso_<digit>** : as for **period** above, except that the date formating fits CDO conventions : - date format is ISO : YYYY-MM-DDTHH:MM:SS - interval is [date1,date2_iso], where date2_iso is 1 minute before date2 - separator between dates is : , - **domain, domain_<digit>** : when a script can select a domain in the input grid, this is declared by adding this keyword in the calling sequence; CliMAF will replace it by the domain definition if needed, as 'latmin,latmax,lonmin,lonmax' ; 'domain' stands for first input stream, 'domain_<digit>' for the next ones : - in the example above, we assume that external binary CDO is not tasked with selecting the domain, and that CliMAF must feed CDO with a datafile where it 
has already performed the selection - **out, out_<word>** : CliMAF provide file names for output files (if there is no such field, the script will have only 'side effects', e.g. launch a viewer). Main output file must be created by the script with the name provided at the location of argument ${out}. Using arguments like 'out_<word>' tells CliMAF that the script provide some secondary output, which will be symbolically known in CliMAF syntax as an attribute of the main object; by default, the variable name of each output equals the name of the output (except for the main ouput, which variable name is supposed to be the same as for the first input); for other cases, see argument \*\*kwargs to provide a format string, used to derive the variable name from first input variable name as in e.g. : ``output2_var='std_dev(%s)'`` for the output labelled output2 (i.e. declared as '${out_output2}') - in the example above, we just apply the convention used by CDO, which expects that you provide an output filename as last argument on the command line. See example mean_and_sdev in doc for advanced usage. - **crs** : will be replaced by the CliMAF Reference Syntax expression describing the first input stream; can be useful for plot title or legend - **alias** : means that the script can make an on the fly re-scaling and renaming of a variable. Will be replaced by a string which pattern is : 'new_varname,file_varname,scale,offset'. The script should then transform on reading as new_varname = file_varname * scale + offset - **units, units_<digit>** : means that the script can set the units on-the-fly while reading one of the input streams - **missing** : means that the script can make an on-the-fly transformation of a givent constant to missing values - **labels** : for script accepting ensembles, CliMAF will replace this keyword by a string bearing the labels associated with the ensemble, with delimiter $ as e.g. in: "CNRM-CM5 is fine$IPSL-CM5-LR is not bad$CCSM-29 is ..." 
""" # Check that script name do not clash with an existing symbol if name in sys.modules['__main__'].__dict__ and name not in scripts : clogger.error("trying to define %s as an operator, " "while it exists as smthing else"%name) return None if name in scripts : clogger.warning("Redefining CliMAF script %s"%name) # # Check now that script is executable scriptcommand=command.split(' ')[0].replace("(","") ex=subprocess.Popen(['which',scriptcommand], stdout=subprocess.PIPE) if ex.wait() != 0 : Climaf_Operator_Error("defining %s : command %s is not " "executable"%(name,scriptcommand)) executable=ex.stdout.read().replace('\n','') # # Analyze inputs field keywords and populate dict # attribute 'inputs' with some properties self.inputs=dict() commuteWithEnsemble=True it=re.finditer( r"\${(?P<keyw>(?P<mult>mm)?in(?P<serie>s)?(_(?P<n>([\d]+)))?)}", command) for oc in it : if (oc.group("n") is not None) : rank=int(oc.group("n")) else : rank=0 if rank in self.inputs : Climaf_Operator_Error( "When defining %s : duplicate declaration for input #%d"%\ (name,rank)) serie=(oc.group("serie") is not None) multiple=(oc.group("mult") is not None) if multiple : if rank != 0 : raise Climaf_Operator_Error( "Only first operand may accept members") if serie : raise Climaf_Operator_Error( "Operand %s cannot both accept" "members and files set"%oc.group("keyw")) commuteWithEnsemble=False self.inputs[rank]=(oc.group("keyw"),multiple,serie) if len(self.inputs)==0 : Climaf_Operator_Error( "When defining %s : command %s must include at least one of " "${in} ${ins} ${mmin} or ${in_..} ... 
for specifying how CliMAF" " will provide the input filename(s)"% (name,command)) #print self.inputs for i in range(len(self.inputs)) : if i+1 not in self.inputs and not ( i == 0 and 0 in self.inputs) : Climaf_Operator_Error( "When defining %s : error in input sequence for rank %d"%\ (name,i+1)) # # Check if command includes an argument allowing for # providing an output filename if command.find("${out") < 0 : format=None # # Search in call arguments for keywords matching "<output_name>_var" # which may provide format string for 'computing' outputs variable # name from input variable name outvarnames=dict() ; pattern=r"^(.*)_var$" for p in kwargs : if re.match(pattern,p): outvarnames[re.findall(pattern,p)[0]]=kwargs[p] #clogger.debug("outvarnames = "+`outvarnames`) # # Analyze outputs names , associated variable names # (or format strings), and store it in attribute dict 'outputs' self.outputs=dict() it=re.finditer(r"\${out(_(?P<outname>[\w-]*))?}",command) for occ in it : outname=occ.group("outname") if outname is not None : if (outname in outvarnames) : self.outputs[outname]=outvarnames[outname] else : self.outputs[outname]=outname else: self.outputs[None]="%s" #clogger.debug("outputs = "+`self.outputs`) # canSelectVar= (command.find("${var}") > 0 ) canAggregateTime=(command.find("${ins}") > 0 or command.find("${ins_1}") > 0) canAlias= (command.find("${alias}") > 0 ) canMissing= (command.find("${missing}") > 0 ) canSelectTime=False if command.find("${period}") > 0 or command.find("${period_1}") > 0 : canSelectTime=True if command.find("${period_iso}") > 0 or command.find("${period_iso_1}") > 0 : canSelectTime=True canSelectDomain=(command.find("${domain}") > 0 or command.find("${domain_1}") > 0) # self.name=name self.command=command self.flags=scriptFlags(canOpendap, canSelectVar, canSelectTime, \ canSelectDomain, canAggregateTime, canAlias, canMissing,\ commuteWithEnsemble,\ commuteWithTimeConcatenation, commuteWithSpaceConcatenation ) self.outputFormat=format 
scripts[name]=self # Init doc string for the operator doc="CliMAF wrapper for command : %s"%self.command # try to get a better doc string from colocated doc/directory docfilename=os.path.dirname(__file__)+"/../doc/scripts/"+name+".rst" #print "docfilen= "+docfilename try: docfile=open(docfilename) doc=docfile.read() docfile.close() except: pass # # creates a function named as requested, which will invoke # capply with that name and same arguments defs='def %s(*args,**dic) :\n """%s"""\n return driver.capply("%s",*args,**dic)\n'\ % (name,doc,name) exec defs in globals() # exec "from climaf.operators import %s"%name in \ sys.modules['__main__'].__dict__ clogger.debug("CliMAF script %s has been declared"%name)
def derive(project, derivedVar, Operator, *invars, **params) :
    """
    Declare that, in 'project', variable(s) 'derivedVar' can be computed by
    applying 'Operator' to input streams which are datasets whose variable
    names take the values in ``*invars``, the parameters/arguments of
    Operator taking the values in ``**params``.

    'project' may be the wildcard : '*'

    Example, assuming that operator 'minus' has been defined as ::

    >>> cscript('minus','cdo sub ${in_1} ${in_2} ${out}')

    you may define, for project 'CMIP5', a variable for cloud radiative
    effect at the surface, 'rscre', as the difference between all-sky and
    clear-sky net surface radiation ::

    >>> derive('CMIP5', 'rscre','minus','rs','rscs')

    and then use 'rscre' anywhere a variable name is expected.

    Another example is rescaling or renaming a variable ::

    >>> derive('erai', 'ta','rescale', 't', scale=1., offset=0.)

    **However, this is not the most efficient way to do that**. See
    :py:func:`~climaf.classes.calias()`

    Expert use : 'derivedVar' may be a dictionary which keys are derived
    variable names and values are script output names; example ::

    >>> cscript('vertical_interp', 'vinterp.sh ${in} surface_pressure=${in_2} ${out_l500} ${out_l850} method=${opt}')
    >>> derive('*', {'z500' : 'l500' , 'z850' : 'l850'},'vertical_interp', 'zg', 'ps', opt='log')
    """
    # Register the information in dict 'derived_variables', which keys are
    # single derived variable names; it is used at object evaluation step.
    # Also perform some consistency checks w.r.t. script declaration.
    if Operator in scripts :
        mapping = derivedVar if isinstance(derivedVar, dict) else dict(out=derivedVar)
        for outname in mapping :
            # 'out' is always legal; other names must be declared script outputs
            known = (outname == 'out' or
                     (getattr(Operator, "outvarnames", None) and
                      outname in Operator.outvarnames))
            if not known :
                raise Climaf_Operator_Error(
                    "%s is not a named ouput for operator %s; type help(%s)"
                    % (outname, Operator, Operator))
            script = scripts[Operator]
            if script.inputs_number() != len(invars) :
                clogger.error("number of input variables for operator"
                              "%s is %d, which is inconsistent with "
                              "script declaration : %s"
                              % (script.name, len(invars), script.command))
                return
            # TBD : check parameters number (need to build its list in cscript.init())
            derived_variables.setdefault(project, dict())
            derived_variables[project][mapping[outname]] = \
                (Operator, outname, list(invars), params)
    elif Operator in operators :
        clogger.warning("Cannot yet handle derived variables based on internal operators")
    else :
        clogger.error("second argument must be a script or operator, already declared")
def __init__(self, name, command, format="nc", canOpendap=False, commuteWithTimeConcatenation=False, commuteWithSpaceConcatenation=False, canSelectVar=False, **kwargs): """ Declare a script or binary as a 'CliMAF operator', and define a Python function with the same name Args: name (str): name for the CliMAF operator. command (str): script calling sequence, according to the syntax described below. format (str): script outputs format -- either 'nc', 'png', 'pdf', 'eps', 'None' or 'graph' ('graph' allows to the user to choose three different graphic output formats: 'png', 'pdf' or 'eps') or 'txt' (the text output are not managed by CliMAF, but only displayed - 'txt' allows to use e.g. 'ncdump -h' from inside CliMAF); defaults to 'nc' canOpendap (bool, optional): is the script able to use OpenDAP URIs ? default to False commuteWithTimeConcatenation (bool, optional): can the operation commute with concatenation of time periods ? set it to true, if the operator can be applied on time chunks separately, in order to allow for incremental computation / time chunking; defaults to False commuteWithSpaceConcatenation (bool, optional): can the operation commute with concatenation of space domains ? defaults to False (see commuteWithTimeConcatenation) **kwargs : possible keyword arguments, with keys matching '<outname>_var', for providing a format string allowing to compute the variable name for output 'outname' (see below). 
Returns: None The script calling sequence pattern string (arg 'command') indicates how to build the system call which actually launches the script, with a match between python objects and formal arguments; For introducing the syntax, please consider this example, with the following commands:: >>> cscript('mycdo','cdo ${operator} ${in} ${out}') >>> # define some dataset >>> tas_ds = ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981') >>> # Apply operator 'mycdo' to dataset 'tas_ds', choosing a given 'operator' argument >>> tas_avg = mycdo(tas_ds,operator='timavg') CliMAF will later on launch this call behind the curtain:: $ cdo tim_avg /home/my/tmp/climaf_cache/8a/5.nc /home/my/tmp/climaf_cache/4e/4.nc where : - the last filename is generated by CliMAF from the formal expression describing 'tas_avg', and will receive the result - the first filename provides a file generated by CliMAF which includes the data required for tas_ds There are a number of examples declared in module :download:`standard_operators <../climaf/standard_operators.py>`. 
**Detailed syntax**: - formal arguments appear as : ``${argument}`` (in the example : ``${in}``, ``${out}``, ``${operator}`` ) - except for reserved keywords, arguments in the pattern will be replaced by the values for corresponding keywords used when invoking the diagnostic operator: - in the example above : argument ``operator`` is replaced by value ``timavg``, which is a keyword known to the external binary called, CDO - reserved argument keywords are : - **in, in_<digit>, ins, ins_<digit>, mmin** : they will be replaced by CliMAF managed filenames for input data, as deduced from dataset description or upstream computation; these filenames can actually be remote URLs (if the script can use OpenDAP, see args), local 'raw' data files, or CliMAF cache filenames - **in** stands for the URL of the first dataset invoked in the operator call - **in_<digit>** stands for the next ones, in the same order - **ins** and **ins_<digit>** stand for the case where the script can select input from multiple input files or URLs (e.g. when the whole period to process spans over multiple files); in that case, a single string (surrounded with double quotes) will carry multiple URLs - **mmin** stands for the case where the script accepts an ensemble of datasets (only for first input stream yet). 
CliMAF will replace the keyword by a string composed of the corresponding input filenames (not surrounded by quotes - please add them yourself in declaration); see also ``labels`` below - **var, var_<digit>** : when a script can select a variable in a multi-variable input stream, this is declared by adding this keyword in the calling sequence; CliMAF will replace it by the actual variable name to process; 'var' stands for first input stream, 'var_<digit>' for the next ones; - in the example above, we assume that external binary CDO is not tasked with selecting the variable, and that CliMAF must feed CDO with a datafile where it has already performed the selection - **period, period_<digit>** : when a script can select a time period in the content of a file or stream, it should declare it by putting this keyword in the pattern, which will be replaced at call time by the period written as <date1>-<date2>, where date is formated as YYYYMMDD ; - time intervals must be interpreted as [date1, date2[ - 'period' stands for the first input_stream, - 'period_<n>' for the next ones, in the order of actual call; - in the example above, this keyword is not used, which means that CliMAF has to select the period upstream of feeding CDO with the data - **period_iso, period_iso_<digit>** : as for **period** above, except that the date formating fits CDO conventions : - date format is ISO : YYYY-MM-DDTHH:MM:SS - interval is [date1,date2_iso], where date2_iso is 1 minute before date2 - separator between dates is : , - **domain, domain_<digit>** : when a script can select a domain in the input grid, this is declared by adding this keyword in the calling sequence; CliMAF will replace it by the domain definition if needed, as 'latmin,latmax,lonmin,lonmax' ; 'domain' stands for first input stream, 'domain_<digit>' for the next ones : - in the example above, we assume that external binary CDO is not tasked with selecting the domain, and that CliMAF must feed CDO with a datafile where it 
has already performed the selection - **out, out_<word>** : CliMAF provide file names for output files (if there is no such field, the script will have only 'side effects', e.g. launch a viewer). Main output file must be created by the script with the name provided at the location of argument ${out}. Using arguments like 'out_<word>' tells CliMAF that the script provide some secondary output, which will be symbolically known in CliMAF syntax as an attribute of the main object; by default, the variable name of each output equals the name of the output (except for the main ouput, which variable name is supposed to be the same as for the first input); for other cases, see argument \*\*kwargs to provide a format string, used to derive the variable name from first input variable name as in e.g. : ``output2_var='std_dev(%s)'`` for the output labelled output2 (i.e. declared as '${out_output2}') or ``_var='std_dev(%s)'`` for the default (main) output - in the example above, we just apply the convention used by CDO, which expects that you provide an output filename as last argument on the command line. See example mean_and_sdev in doc for advanced usage. - **crs** : will be replaced by the CliMAF Reference Syntax expression describing the first input stream; can be useful for plot title or legend - **alias** : used if the script can make an on the fly re-scaling and renaming of a variable. Will be replaced by a string which pattern is : 'new_varname,file_varname,scale,offset'. The script should then transform on reading as new_varname = file_varname * scale + offset - **units, units_<digit>** : means that the script can set the units on-the-fly while reading one of the input streams - **missing** : means that the script can make an on-the-fly transformation of a given constant to missing values - **labels** : for script accepting ensembles, CliMAF will replace this keyword by a string bearing the labels associated with the ensemble, with delimiter $ as e.g. 
in: "CNRM-CM5 is fine$IPSL-CM5-LR is not bad$CCSM-29 is ..." """ # Check that script name do not clash with an existing symbol if name in sys.modules['__main__'].__dict__ and name not in scripts: clogger.error("trying to define %s as an operator, " "while it exists as smthing else" % name) return None if name in scripts: clogger.warning("Redefining CliMAF script %s" % name) # # Check now that script is executable scriptcommand = command.split(' ')[0].replace("(", "") ex = subprocess.Popen(['which', scriptcommand], stdout=subprocess.PIPE) if ex.wait() != 0: Climaf_Operator_Error("defining %s : command %s is not " "executable" % (name, scriptcommand)) executable = ex.stdout.read().replace('\n', '') # # Analyze inputs field keywords and populate dict # attribute 'inputs' with some properties self.inputs = dict() commuteWithEnsemble = True it = re.finditer( r"\${(?P<keyw>(?P<mult>mm)?in(?P<serie>s)?(_(?P<n>([\d]+)))?)}", command) for oc in it: if oc.group("n") is not None: rank = int(oc.group("n")) else: rank = 0 if rank in self.inputs: Climaf_Operator_Error( "When defining %s : duplicate declaration for input #%d" % (name, rank)) serie = (oc.group("serie") is not None) multiple = (oc.group("mult") is not None) if multiple: if rank != 0: raise Climaf_Operator_Error( "Only first operand may accept members") if serie: raise Climaf_Operator_Error("Operand %s cannot both accept" "members and files set" % oc.group("keyw")) commuteWithEnsemble = False self.inputs[rank] = (oc.group("keyw"), multiple, serie) if len(self.inputs) == 0: Climaf_Operator_Error( "When defining %s : command %s must include at least one of " "${in} ${ins} ${mmin} or ${in_..} ... 
for specifying how CliMAF" " will provide the input filename(s)" % (name, command)) # print self.inputs for i in range(len(self.inputs)): if i + 1 not in self.inputs and not (i == 0 and 0 in self.inputs): Climaf_Operator_Error( "When defining %s : error in input sequence for rank %d" % (name, i + 1)) # # Check if command includes an argument allowing for # providing an output filename if command.find("${out") < 0: if format is not "txt": format = None # # Search in call arguments for keywords matching "<output_name>_var" # which may provide format string for 'computing' outputs variable # name from input variable name outvarnames = dict() pattern = r"^(.*)_var$" for p in kwargs: if re.match(pattern, p): outvarnames[re.findall(pattern, p)[0]] = kwargs[p] clogger.debug("outvarnames for script %s = %s" % (name, repr(outvarnames))) # # Analyze outputs names , associated variable names # (or format strings), and store it in attribute dict 'outputs' self.outputs = dict() it = re.finditer(r"\${out(_(?P<outname>[\w-]*))?}", command) for occ in it: outname = occ.group("outname") if outname is not None: if outname in outvarnames: self.outputs[outname] = outvarnames[outname] else: self.outputs[outname] = "%s" # outname else: self.outputs[None] = outvarnames.get('', "%s") self.outputs[''] = outvarnames.get('', "%s") # clogger.debug("outputs = "+`self.outputs`) # canSelectVar = canSelectVar or (command.find("${var}") > 0) canAggregateTime = (command.find("${ins}") > 0 or command.find("${ins_1}") > 0) canAlias = (command.find("${alias}") > 0) canMissing = (command.find("${missing}") > 0) canSelectTime = False if command.find("${period}") > 0 or command.find("${period_1}") > 0: canSelectTime = True if command.find("${period_iso}") > 0 or command.find( "${period_iso_1}") > 0: canSelectTime = True canSelectDomain = (command.find("${domain}") > 0 or command.find("${domain_1}") > 0) # self.name = name self.command = command self.fixedfields = None self.flags = scriptFlags(canOpendap, 
canSelectVar, canSelectTime, canSelectDomain, canAggregateTime, canAlias, canMissing, commuteWithEnsemble, commuteWithTimeConcatenation, commuteWithSpaceConcatenation) if format in known_formats or format in graphic_formats or format in none_formats: self.outputFormat = format else: raise Climaf_Operator_Error( "Allowed formats yet are : 'object', 'nc', 'txt', %s" % ', '.join([repr(x) for x in graphic_formats])) scripts[name] = self # Init doc string for the operator doc = "CliMAF wrapper for command : %s" % self.command # try to get a better doc string from colocated doc/directory docfilename = os.path.dirname( __file__) + "/../doc/scripts/" + name + ".rst" # print "docfilen= "+docfilename try: docfile = open(docfilename) doc = docfile.read() docfile.close() except: pass # # creates a function named as requested, which will invoke # capply with that name and same arguments defs = 'def %s(*args,**dic) :\n """%s"""\n return driver.capply("%s",*args,**dic)\n' \ % (name, doc, name) exec defs in globals() # exec "from climaf.operators import %s" % name in \ sys.modules['__main__'].__dict__ clogger.debug("CliMAF script %s has been declared" % name)
def selectGenericFiles(urls, **kwargs):
    """
    Select data files from a ``generic`` file organization.

    Returns the list of files which match one of the patterns in ``urls``,
    once these patterns are instantiated by the facet values in ``kwargs``,
    and which hold the ``variable`` and intersect the ``period`` provided
    in kwargs. No keyword is mandatory in the pattern strings.

    Example::

     >>> selectGenericFiles(project='my_projet',model='my_model', simulation='lastexp',
     ...     variable='tas', period='1980', urls=['~/DATA/${project}/${model}/*${variable}*YYYY*.nc)'])
     /home/stephane/DATA/my_project/my_model/somefilewith_tas_Y1980.nc

    Besides the argument names (e.g. ${model}), patterns may use :

    - ${variable} : if files are split by variable and filenames include
      the variable name (this speeds up the search)
    - YYYY, YYYYMM, YYYYMMDD : start date of the period covered by each
      file; a second occurrence marks the end date (otherwise the whole
      year - resp. month or day - is assumed included in the file)
    - wildcards '?' and '*' matching respectively one and any number of
      characters
    """
    selected = []
    period = kwargs['period']
    if type(period) is str:
        period = init_period(period)
    variable = kwargs['variable']
    altvar = kwargs.get('filenameVar', variable)
    # Date keywords -> globbing patterns, longest keyword first
    glob_of = dict(YYYY="????", YYYYMM="??????", YYYYMMDD="????????")
    glob_keys = glob_of.keys()
    glob_keys.sort(reverse=True)
    # Date keywords -> capturing regexps, longest keyword first
    regexp_of = dict(YYYY="([0-9]{4})", YYYYMM="([0-9]{6})",
                     YYYYMMDD="([0-9]{8})")
    regexp_keys = regexp_of.keys()
    regexp_keys.sort(reverse=True)
    #
    for url in urls:
        # Instantiate facet keywords in pattern with attribute values
        template = Template(url).safe_substitute(**kwargs)
        # Build a globbing pattern by replacing date keywords
        globbed = template
        for k in glob_keys:
            globbed = globbed.replace(k, glob_of[k])
        files = glob.glob(globbed)
        clogger.debug("Globbing %d files for varname on %s : " % (len(files), globbed))
        #
        # If unsuccessful using varname, try with filenameVar
        if len(files) == 0 and "filenameVar" in kwargs and kwargs['filenameVar']:
            kwargs['variable'] = kwargs['filenameVar']
            template = Template(url).safe_substitute(**kwargs)
            globbed = template
            for k in glob_keys:
                globbed = globbed.replace(k, glob_of[k])
            files = glob.glob(globbed)
            clogger.debug("Globbing %d files for filenamevar on %s: " % (len(files), globbed))
        #
        # Build a regexp extracting dates from filenames : the first date
        # keyword captures the start date, a second occurrence the end date
        regexp = None
        hasEnd = False
        for key in regexp_keys:
            if template.find(key) >= 0:
                regexp = template.replace(key, regexp_of[key], 1)
                if regexp.find(key) >= 0:
                    hasEnd = True
                    regexp = regexp.replace(key, regexp_of[key], 1)
                break
        #
        for f in files:
            # Analyze file time period
            fperiod = None
            if regexp:
                regexp0 = regexp.replace("*", ".*").replace("?", r".")
                start = re.sub(regexp0, r'\1', f)
                if start == f:
                    raise Climaf_Data_Error("Start period not found")  # ? LV
                if hasEnd:
                    end = re.sub(regexp0, r'\2', f)
                    fperiod = init_period("%s-%s" % (start, end))
                else:
                    fperiod = init_period(start)
            else:
                # No date in filename : keep fixed fields on variable match;
                # otherwise keep the file (filtering on file content time is
                # not implemented yet)
                if ('frequency' in kwargs and
                        ((kwargs['frequency'] == "fx") or
                         kwargs['frequency'] == "seasonnal" or
                         kwargs['frequency'] == "annual_cycle")):
                    if (url.find("${variable}") >= 0) or fileHasVar(f, variable) \
                            or fileHasVar(f, altvar):
                        clogger.debug("adding fixed field :" + f)
                        selected.append(f)
                else:
                    clogger.warning("Cannot yet filter files re. time using only file content. TBD")
                    selected.append(f)
            # Filter file time period against required period, then on variable
            if (fperiod and period.intersects(fperiod)) or not regexp:
                clogger.debug('Period is OK - Considering variable filtering on %s and %s for %s'
                              % (variable, altvar, f))
                if url.find("${variable}") >= 0:
                    clogger.debug('appending %s based on variable in filename' % f)
                    selected.append(f)
                    continue
                if f not in selected and (fileHasVar(f, variable) or
                                          fileHasVar(f, altvar) or ("," in variable)):
                    # Should check time period in the file if not regexp
                    clogger.debug('appending %s based on multi-var or var exists in file ' % f)
                    selected.append(f)
            else:
                if not fperiod:
                    clogger.debug('not appending %s because period is None ' % f)
                else:
                    if not period.intersects(fperiod):
                        clogger.debug('not appending %s because period doesn t intersect %s' % (f, period))
    return selected
def selectFiles(return_wildcards=None, merge_periods_on=None, **kwargs):
    """
    Returns the shortest list of (local or remote) files which include
    the data for the list of (facet,value) pairs provided

    Method :

    - use datalocations indexed by :py:func:`~climaf.dataloc.dataloc` to
      identify data organization and data store urls for these
      (facet,value) pairs
    - check that data organization is a known one, i.e. is one of
      'generic', 'CMIP5_DRS' or 'EM'
    - derive the relevant filename search function, such as
      :py:func:`~climaf.dataloc.selectCmip5DrsFiles`, from the data
      organization scheme
    - pass urls and relevant facet values to this filename search function

    Returns None if no data location matches (or no file is found);
    otherwise a single space-separated string of the sorted,
    de-duplicated filenames.
    """
    rep = []
    project = kwargs['project']
    simulation = kwargs['simulation']
    model = kwargs['model'] if 'model' in kwargs else "*"
    frequency = kwargs['frequency'] if 'frequency' in kwargs else "*"
    ofu = getlocs(project=project, model=model, simulation=simulation,
                  frequency=frequency)
    clogger.debug("locs=" + repr(ofu))
    if len(ofu) == 0:
        clogger.warning("no datalocation found for %s %s %s %s "
                        % (project, model, simulation, frequency))
    for org, freq, urls in ofu:
        # fix: compare with '!=' - 'is not' on a str literal tests identity,
        # which only works by interning accident
        if return_wildcards is not None and org != "generic":
            # fix: typos in user-visible message ("hanle multipe")
            raise classes.Climaf_Error(
                "Can handle multiple facet query only for organization=generic ")
        kwargs2 = kwargs.copy()
        # Convert normalized frequency to project-specific frequency if applicable
        if "frequency" in kwargs and project in classes.frequencies:
            normfreq = kwargs2['frequency']
            if normfreq in classes.frequencies[project]:
                kwargs2['frequency'] = classes.frequencies[project][normfreq]
        # Convert normalized realm to project-specific realm if applicable
        if "realm" in kwargs and project in classes.realms:
            normrealm = kwargs2['realm']
            if normrealm in classes.realms[project]:
                kwargs2['realm'] = classes.realms[project][normrealm]
        #
        # Call organization-specific routine
        if org == "EM":
            rep.extend(selectEmFiles(**kwargs2))
        elif org == "CMIP5_DRS":
            rep.extend(selectCmip5DrsFiles(urls, **kwargs2))
        elif org == "generic":
            rep.extend(selectGenericFiles(urls, return_wildcards=return_wildcards,
                                          merge_periods_on=merge_periods_on,
                                          **kwargs2))
        else:
            raise classes.Climaf_Error("Cannot process organization " + org +
                                       " for simulation " + simulation +
                                       " and model " + model +
                                       " of project " + project)
    if not ofu:
        return None
    if len(rep) == 0:
        clogger.warning("no file found for %s, at these "
                        "data locations %s " % (repr(kwargs), repr(urls)))
        if any([kwargs[k] == '' for k in kwargs]):
            clogger.warning("Please check these empty attributes %s"
                            % [k for k in kwargs if kwargs[k] == ''])
        return None
    # Discard duplicates (assumes that sorting is harmless for later
    # processing); fix: the previous remove-while-iterating loop could
    # skip duplicates
    rep = sorted(set(rep))
    # Assemble filenames in one single (space-separated) string
    return " ".join(rep)
def derive(project, derivedVar, Operator, *invars, **params):
    """
    Declare that 'derivedVar' is a derived variable in 'project', computed
    by applying 'Operator' to input streams which are datasets whose
    variable names take the values in ``*invars``, the parameters/arguments
    of Operator taking the values in ``**params``.

    'project' may be the wildcard : '*'

    Example, assuming that operator 'minus' has been defined as ::

    >>> cscript('minus','cdo sub ${in_1} ${in_2} ${out}')

    you may define, for project 'CMIP5', a new variable for cloud radiative
    effect at the surface, 'rscre', as the difference between all-sky and
    clear-sky net surface radiation ::

    >>> derive('CMIP5', 'rscre','minus','rs','rscs')

    and then use this variable name anywhere a variable name is expected.

    Another example is rescaling or renaming some variable ::

    >>> derive('erai', 'ta','rescale', 't', scale=1., offset=0.)

    **However, this is not the most efficient way to do that**. See
    :py:func:`~climaf.classes.calias()`

    Expert use : argument 'derivedVar' may be a dictionary which keys are
    derived variable names and values are script output names; example ::

    >>> cscript('vertical_interp', 'vinterp.sh ${in} surface_pressure=${in_2} ${out_l500} ${out_l850} method=${opt}')
    >>> derive('*', {'z500' : 'l500' , 'z850' : 'l850'},'vertical_interp', 'zg', 'ps', opt='log')
    """
    # Register the information in dict 'derived_variables', which keys are
    # single derived variable names; it is used at object evaluation step.
    # Also perform some consistency checks w.r.t. script declaration.
    if Operator in scripts:
        out_map = derivedVar if isinstance(derivedVar, dict) else {'out': derivedVar}
        for out_name in out_map:
            # 'out' is always legal; other names must be declared script outputs
            known = (out_name == 'out'
                     or (getattr(Operator, "outvarnames", None)
                         and out_name in Operator.outvarnames))
            if not known:
                raise Climaf_Operator_Error(
                    "%s is not a named ouput for operator %s; type help(%s)"
                    % (out_name, Operator, Operator))
            script = scripts[Operator]
            if script.inputs_number() != len(invars):
                clogger.error(
                    "number of input variables for operator %s is %d, which is inconsistent with "
                    "script declaration : %s"
                    % (script.name, len(invars), script.command))
                return
            # TBD : check parameters number (need to build its list in cscript.init())
            derived_variables.setdefault(project, dict())
            derived_variables[project][out_map[out_name]] = \
                (Operator, out_name, list(invars), params)
    elif Operator in operators:
        clogger.warning(
            "Cannot yet handle derived variables based on internal operators")
    else:
        clogger.error(
            "second argument (%s) must be a script or operator, already declared"
            % repr(Operator))