def ds(*args, **kwargs):
    """
    Returns a dataset from its full Climate Reference Syntax string. Example ::

    >>> ds('CMIP5.historical.pr.[1980].global.monthly.CNRM-CM5.r1i1p1.mon.Amon.atmos.last')

    Also a shortcut for :py:meth:`~climaf.classes.cdataset`, when used with only
    keyword arguments. Example ::

    >>> cdataset(project='CMIP5', model='CNRM-CM5', experiment='historical', frequency='monthly',\
    simulation='r2i3p9', domain=[40,60,-10,20], variable='tas', period='1980-1989', version='last')

    """
    if len(args) > 1:
        raise Climaf_Classes_Error("Must provide either only a string or only keyword arguments")
    # clogger.debug("Entering, with args=%s, kwargs=%s" % (repr(args), repr(kwargs)))
    if len(args) == 0:
        return cdataset(**kwargs)  # Front-end to cdataset
    crs = args[0]
    results = []
    for cproj in cprojects:
        try:
            dataset = cprojects[cproj].crs2ds(crs)
        except Climaf_Classes_Error:
            dataset = None
        if dataset:
            results.append(dataset)
    if len(results) > 1:
        e = "CRS expression %s is ambiguous among projects %s" % (crs, repr(cprojects.keys()))
        if allow_errors_on_ds_call:
            clogger.info(e)
        else:
            raise Climaf_Classes_Error(e)
    elif len(results) == 0:
        e = "CRS expression %s is not valid for any project in %s" % (crs, repr(cprojects.keys()))
        if allow_errors_on_ds_call:
            clogger.info(e)
        else:
            raise Climaf_Classes_Error(e)
    else:
        rep = results[0]
        if rep.project == 'file':
            rep.files = rep.kvp["path"]
        return rep

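# A minimal usage sketch for ds(), not part of CliMAF itself; the facet values
# below are illustrative assumptions. It shows the two calling conventions :
# a keyword form (front-end to cdataset) and a full CRS string form, which is
# parsed against every declared project and must match exactly one of them.
def _ds_usage_example():
    # Keyword form : a front-end to cdataset()
    tas = ds(project='CMIP5', model='CNRM-CM5', experiment='historical',
             variable='tas', period='1980', simulation='r1i1p1')
    # CRS string form; a string which several declared projects can parse is
    # reported as ambiguous (Climaf_Classes_Error, unless allow_errors_on_ds_call)
    same = ds('CMIP5.historical.tas.[1980].global.monthly.CNRM-CM5.r1i1p1.mon.Amon.atmos.last')
    return tas, same
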
def register(filename, crs):
    """
    Adds in FILE a metadata named 'CRS_def' and with value CRS, and a
    metadata 'CliMAF' with CliMAF version and reference URL

    Records this FILE in dict crs2filename

    Silently skips non-existing files
    """
    # First read index from file if it is yet empty - No : done at startup
    # if len(crs2filename.keys()) == 0 : cload()
    # It appears that we have to let some time to the file system for updating its inode tables
    waited = 0
    while waited < 20 and not os.path.exists(filename):
        time.sleep(0.1)
        waited += 1
    # time.sleep(0.5)
    if os.path.exists(filename):
        # while time.time() < os.path.getmtime(filename) + 0.2 : time.sleep(0.2)
        if re.findall(".nc$", filename):
            command = "ncatted -h -a CRS_def,global,o,c,\"%s\" -a CliMAF,global,o,c,\"CLImate Model Assessment " \
                      "Framework version %s (http://climaf.rtfd.org)\" %s" % (crs, version, filename)
        if re.findall(".png$", filename):
            command = "convert -set \"CRS_def\" \"%s\" -set \"CliMAF\" \"CLImate Model Assessment Framework " \
                      "version %s (http://climaf.rtfd.org)\" %s %s.png && mv -f %s.png %s" % \
                      (crs, version, filename, filename, filename, filename)
        clogger.debug("trying stamping by %s" % command)
        if os.system(command) == 0:
            crs2filename[crs] = filename
            clogger.info("%s registered as %s" % (crs, filename))
            return True
        else:
            clogger.critical("cannot stamp by %s" % command)
            return None
    else:
        clogger.error("file %s does not exist (for crs %s)" % (filename, crs))

def register(filename, crs):
    """
    Adds in FILE a metadata named CRS_def and with value CRS

    Records this FILE in dict crs2filename

    Silently skips non-existing files
    """
    # First read index from file if it is yet empty
    if len(crs2filename.keys()) == 0:
        cload()
    # It appears that we have to allow the file system some time for updating its inode tables
    waited = 0
    while waited < 10 and not os.path.exists(filename):
        time.sleep(0.5)
        waited += 1
    time.sleep(0.5)
    if os.path.exists(filename):
        # while time.time() < os.path.getmtime(filename) + 0.2 : time.sleep(0.2)
        if re.findall(".nc$", filename):
            command = "ncatted -h -a CRS_def,global,o,c,\"%s\" %s" % (crs, filename)
        if re.findall(".png$", filename):
            command = "convert -set \"CRS_def\" \"%s\" %s %s.png && mv -f %s.png %s" % \
                      (crs, filename, filename, filename, filename)
        clogger.debug("trying stamping by %s" % command)
        if os.system(command) == 0:
            crs2filename[crs] = filename
            clogger.info("%s registered as %s" % (crs, filename))
            return True
        else:
            clogger.critical("cannot stamp by %s" % command)
            return None
    else:
        clogger.error("file %s does not exist (for crs %s)" % (filename, crs))

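# A sketch (not part of CliMAF) of how a stamp written by register() can be
# read back for a NetCDF file, assuming the standard 'ncdump' tool is
# available : register() writes the CRS in the global attribute 'CRS_def'
# using ncatted, so it shows up in the file header.
def _check_stamp_example(filename):
    import subprocess
    header = subprocess.Popen(["ncdump", "-h", filename],
                              stdout=subprocess.PIPE).stdout.read()
    # Return the header line(s) carrying the stamp, e.g. ':CRS_def = "..." ;'
    return [line for line in header.splitlines() if "CRS_def" in line]
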
def selectEmFiles(**kwargs):
    # For A and L : mon, day1, day2, 6hLev, 6hPlev, 3h
    simulation = kwargs['simulation']
    frequency = kwargs['frequency']
    variable = kwargs['variable']
    period = kwargs['period']
    realm = kwargs['realm']
    #
    freqs = {"mon": "", "3h": "_3h"}
    f = frequency
    if f in freqs:
        f = freqs[f]
    rep = []
    # Must look for all realms, here identified by a single letter
    if realm == "*":
        lrealm = ["A", "L", "O", "I"]
    else:
        lrealm = [realm]
    for realm in lrealm:
        clogger.debug("Looking for realm " + realm)
        # Use EM data for finding data dir
        freq_for_em = f
        if realm == 'I':
            freq_for_em = ""  # This is a special case ...
        command = ["grep", "^export EM_DIRECTORY_" + realm + freq_for_em + "=",
                   os.path.expanduser(os.getenv("EM_HOME")) + "/expe_" + simulation]
        try:
            ex = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except:
            clogger.error("Issue getting archive_location for " + simulation +
                          " for realm " + realm + " with: " + repr(command))
            break
        if ex.wait() == 0:
            dir = ex.stdout.read().split("=")[1].replace('"', "").replace("\n", "")
            clogger.debug("Looking at dir " + dir)
            if os.path.exists(dir):
                lfiles = os.listdir(dir)
                for fil in lfiles:
                    # clogger.debug("Looking at file "+fil)
                    fileperiod = periodOfEmFile(fil, realm, f)
                    if fileperiod and period.intersects(fileperiod):
                        if fileHasVar(dir + "/" + fil, variable):
                            rep.append(dir + "/" + fil)
                    # clogger.debug("Done with looking at file "+fil)
            else:
                clogger.error("Directory %s does not exist for simulation %s, realm %s "
                              "and frequency %s" % (dir, simulation, realm, f))
        else:
            clogger.info("No archive location found for " + simulation +
                         " for realm " + realm + " with: " + repr(command))
    return rep

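# selectEmFiles() locates data directories by grepping lines such as
#   export EM_DIRECTORY_A=/path/to/atmos/monthly
# in the file $EM_HOME/expe_<simulation>. A sketch of the same parsing in pure
# Python (illustrative, not part of CliMAF; same quote/newline stripping) :
def _em_directory_example(simulation, realm, freq_suffix=""):
    import os
    expe_file = os.path.expanduser(os.getenv("EM_HOME")) + "/expe_" + simulation
    prefix = "export EM_DIRECTORY_" + realm + freq_suffix + "="
    with open(expe_file) as fh:
        for line in fh:
            if line.startswith(prefix):
                return line.split("=")[1].replace('"', "").replace("\n", "")
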
def csync(update=False):
    """
    Merges current in-memory cache index and current on-file cache index
    for updating both

    If arg `update` is True, additionally ensures consistency between files
    set and index content, either :

    - if cache.stamping is true, by reading CRS in all files
    - else, by removing files which are not in the index; this may erase
      result files which have been computed by another running
      instance of CliMAF
    """
    # import pickle
    global cacheIndexFileName
    global dropped_crs
    #
    # Merge index on file and index in memory
    file_index = cload(True)
    for crs in dropped_crs:
        file_index.pop(crs, None)
    crs2filename.update(file_index)
    #
    # Check if cache index is up to date; if not, enforce consistency
    if update:
        clogger.info("Listing crs from files present in cache")
        files_in_cache = list_cache()
        files_in_cache.sort()
        files_in_index = crs2filename.values()
        files_in_index.sort()
        if files_in_index != files_in_cache:
            if stamping:
                clogger.info("Rebuilding cache index from file content")
                rebuild()
            else:
                clogger.warning('In no-stamp mode, there is no way to seriously identify CRS from files in cache !')
                # clogger.warning('Removing cache files which content is not known.
                #                  This is an issue in concurrent mode !')
                # for fil in files_in_cache :
                #     if fil not in files_in_index :
                #         os.system("rm %s" % fil)
                #     else :
                #         # Should also remove empty files, as soon as
                #         # file creation will be atomic enough
    # Save index to disk
    fn = os.path.expanduser(cacheIndexFileName)
    try:
        with open(fn, "w") as cacheIndexFile:
            pickle.dump(crs2filename, cacheIndexFile)
        dropped_crs = []
    except:
        if update:
            if os.path.isfile(fn) and len(files_in_cache) > 0:
                clogger.error("Issue when writing cache index %s" % fn)

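# A sketch of the merge rule used above, on illustrative values : entries
# dropped in this session win over the on-disk index, and on-disk entries win
# over stale in-memory ones.
def _merge_semantics_example():
    crs2filename = {"select(a)": "old_a.nc"}
    dropped_crs = ["select(b)"]
    file_index = {"select(a)": "new_a.nc", "select(b)": "b.nc"}
    for crs in dropped_crs:
        file_index.pop(crs, None)    # forget entries dropped in this session
    crs2filename.update(file_index)  # on-disk values take precedence
    return crs2filename              # -> {'select(a)': 'new_a.nc'}
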
def cload(alt=None):
    global crs2filename
    global crs_not_yet_evaluable
    rep = dict()
    if len(crs2filename) != 0 and not alt:
        raise Climaf_Cache_Error("attempt to reset file index - would lead to inconsistency !")
    try:
        with open(os.path.expanduser(cacheIndexFileName), "r") as cacheIndexFile:
            if alt:
                rep = pickle.load(cacheIndexFile)
            else:
                crs2filename = pickle.load(cacheIndexFile)
    except:
        pass
        # clogger.debug("no index file yet")
    #
    must_check_index_entries = False
    if must_check_index_entries:
        # We may have some crs inherited from past sessions and for which
        # some operator may have become non-standard, or some projects are
        # yet undeclared
        crs_not_yet_evaluable = dict()
        allow_error_on_ds()
        for crs in crs2filename.copy():
            try:
                # print("evaluating crs=" + crs)
                eval(crs, sys.modules['__main__'].__dict__)
            except:
                print("Inconsistent cache object is skipped : %s" % crs)
                # clogger.debug("Inconsistent cache object is skipped : %s" % crs)
                p = guess_projects(crs)
                if p not in crs_not_yet_evaluable:
                    crs_not_yet_evaluable[p] = dict()
                crs_not_yet_evaluable[p][crs] = crs2filename[crs]
                crs2filename.pop(crs)
        # Analyze projects of inconsistent cache objects
        projects = crs_not_yet_evaluable.keys()
        if projects:
            clogger.info(
                "The cache has %d objects for non-declared projects %s.\n"
                "For using it, consider including relevant project(s) "
                "declaration(s) in ~/.climaf and restarting CliMAF.\n"
                "You can also declare these projects right now and call 'csync(True)'\n"
                "Or you can erase corresponding data by 'crm(pattern=...project name...)'"
                % (len(crs_not_yet_evaluable), repr(list(projects))))
        allow_error_on_ds(False)
    if alt:
        return rep

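# The 'alt' argument switches cload() between two modes; a sketch of both
# (hypothetical call sites, not part of CliMAF) :
def _cload_modes_example():
    cload()                # fills the global crs2filename, once, at startup
    on_disk = cload(True)  # returns the on-disk index as a plain dict, leaving
                           # the global index untouched (this is what csync uses)
    return on_disk
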
def cdrop(obj, rm=True):
    """
    Deletes the cached file for a CliMAF object, if it exists

    Args:
     obj (cobject or string) : object to delete, or its string representation (CRS)
     rm (bool) : for advanced use only; should we actually delete (rm) the file,
      or just forget it in CliMAF cache index

    Returns:
     None if object does not exist, False if failing to delete, True if OK

    Example ::

    >>> dg=ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
    >>> f=cfile(dg)
    >>> os.system('ls -al '+f)
    >>> cdrop(dg)
    """
    global crs2filename
    if isinstance(obj, cobject):
        crs = repr(obj)
        if isinstance(obj, cdataset):
            crs = "select(" + crs + ")"
    elif type(obj) is str:
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    if crs in crs2filename:
        clogger.info("discarding cached value for " + crs)
        fil = crs2filename.pop(crs)
        if rm:
            try:
                path_file = os.path.dirname(fil)
                os.remove(fil)
                try:
                    os.rmdir(path_file)
                except OSError as ex:
                    clogger.warning(ex)
                return True
            except:
                clogger.warning("When trying to remove %s : file does not exist in cache" % crs)
                return False
    else:
        clogger.info("%s is not cached" % crs)
        return None

def selectEmFiles(**kwargs):
    # For A and L : mon, day1, day2, 6hLev, 6hPlev, 3h
    simulation = kwargs['simulation']
    frequency = kwargs['frequency']
    variable = kwargs['variable']
    period = kwargs['period']
    realm = kwargs['realm']
    #
    freqs = {"mon": "", "3h": "_3h"}
    f = frequency
    if f in freqs:
        f = freqs[f]
    rep = []
    # Must look for all realms, here identified by a single letter
    if realm == "*":
        lrealm = ["A", "L", "O", "I"]
    else:
        lrealm = [realm]
    for realm in lrealm:
        clogger.debug("Looking for realm " + realm)
        # Use EM data for finding data dir
        freq_for_em = f
        if realm == 'I':
            freq_for_em = ""  # This is a special case ...
        command = ["grep", "^export EM_DIRECTORY_" + realm + freq_for_em + "=",
                   os.path.expanduser(os.getenv("EM_HOME")) + "/expe_" + simulation]
        try:
            ex = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except:
            clogger.error("Issue getting archive_location for " + simulation +
                          " for realm " + realm + " with: " + repr(command))
            break
        if ex.wait() == 0:
            dir = ex.stdout.read().split("=")[1].replace('"', "").replace("\n", "")
            clogger.debug("Looking at dir " + dir)
            if os.path.exists(dir):
                lfiles = os.listdir(dir)
                for fil in lfiles:
                    # clogger.debug("Looking at file "+fil)
                    fileperiod = periodOfEmFile(fil, realm, f)
                    if fileperiod and period.intersects(fileperiod):
                        if fileHasVar(dir + "/" + fil, variable):
                            rep.append(dir + "/" + fil)
                    # clogger.debug("Done with looking at file "+fil)
            else:
                clogger.error("Directory %s does not exist for EM simulation %s, realm %s "
                              "and frequency %s" % (dir, simulation, realm, f))
        else:
            clogger.info("No archive location found for " + simulation +
                         " for realm " + realm + " with: " + repr(command))
    return rep

def cload():
    global crs2filename
    global crs_not_yet_evaluable
    if len(crs2filename) != 0:
        raise Climaf_Driver_Error("attempt to reset file index - would lead to inconsistency !")
    try:
        with open(os.path.expanduser(cacheIndexFileName), "r") as cacheIndexFile:
            crs2filename = pickle.load(cacheIndexFile)
    except:
        pass
        # clogger.debug("no index file yet")
    #
    must_check_index_entries = False
    if must_check_index_entries:
        # We may have some crs inherited from past sessions and for which
        # some operator may have become non-standard, or some projects are
        # yet undeclared
        crs_not_yet_evaluable = dict()
        allow_error_on_ds()
        for crs in crs2filename.copy():
            try:
                # print("evaluating crs=" + crs)
                eval(crs, sys.modules['__main__'].__dict__)
            except:
                print("Inconsistent cache object is skipped : %s" % crs)
                # clogger.debug("Inconsistent cache object is skipped : %s" % crs)
                p = guess_projects(crs)
                if p not in crs_not_yet_evaluable:
                    crs_not_yet_evaluable[p] = dict()
                crs_not_yet_evaluable[p][crs] = crs2filename[crs]
                crs2filename.pop(crs)
        # Analyze projects of inconsistent cache objects
        projects = crs_not_yet_evaluable.keys()
        if projects:
            clogger.info(
                "The cache has %d objects for non-declared projects %s.\n"
                "For using it, consider including relevant project(s) "
                "declaration(s) in ~/.climaf and restarting CliMAF.\n"
                "You can also declare these projects right now and call 'csync(True)'\n"
                "Or you can erase corresponding data by 'crm(pattern=...project name...)'"
                % (len(crs_not_yet_evaluable), repr(list(projects))))
        allow_error_on_ds(False)

def read(filename):
    """
    Reads macro dictionary from filename, and adds it to cmacros[]
    """
    import json
    global cmacros
    macros_texts = None
    try:
        macrofile = open(os.path.expanduser(filename), "r")
        clogger.debug("Macrofile %s read" % macrofile)
        macros_texts = json.load(macrofile)
        clogger.debug("After reading file %s, macros=%s" % (macrofile, repr(macros_texts)))
        macrofile.close()
    except:
        clogger.info("Issue reading macro file %s ", filename)
    if macros_texts:
        for m in macros_texts:
            clogger.debug("loading macro %s=%s" % (m, macros_texts[m]))
            macro(str(m), str(macros_texts[m]))

def cdrop(obj, rm=True):
    """
    Deletes the cached file for a CliMAF object, if it exists

    Args:
     obj (cobject or string) : object to delete, or its string representation (CRS)
     rm (bool) : for advanced use only; should we actually delete (rm) the file,
      or just forget it in CliMAF cache index

    Returns:
     None if object does not exist, False if failing to delete, True if OK

    Example ::

    >>> dg=ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
    >>> f=cfile(dg)
    >>> os.system('ls -al '+f)
    >>> cdrop(dg)
    """
    global crs2filename
    if isinstance(obj, cobject):
        crs = repr(obj)
        if isinstance(obj, cdataset):
            crs = "select(" + crs + ")"
    elif type(obj) is str:
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    if crs in crs2filename:
        clogger.info("discarding cached value for " + crs)
        fil = crs2filename.pop(crs)
        if rm:
            try:
                os.remove(fil)
                return True
            except:
                clogger.warning("When trying to remove %s : file does not exist in cache" % crs)
                return False
    else:
        clogger.info("%s is not cached" % crs)
        return None

def cprotect(obj, stop=False):
    """
    Protects the cache file for a given object (or stops protection with arg 'stop=True').

    In order to erase it, argument 'force=True' must then be used with function
    :py:func:`~climaf.cache.craz` or :py:func:`~climaf.cache.cdrop`
    """
    if isinstance(obj, cobject):
        crs = repr(obj)
        if isinstance(obj, cdataset):
            crs = "select(" + crs + ")"
    elif type(obj) is str:
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    if crs in crs2filename:
        if stop is False:
            clogger.info("Protecting cached value for " + crs)
            os.system("chmod -w " + crs2filename[crs])
        else:
            clogger.info("Stopping protection on cached value for " + crs)
            os.system("chmod +w " + crs2filename[crs])
        return
    else:
        clogger.info("%s is not (yet) cached; use cfile() to cache it" % crs)

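# A sketch of how cprotect() interacts with cache deletion, assuming the
# cdrop(obj, rm, force) variant defined later in this file : protection is
# just removal of the write permission, so only force=True can override it.
def _protect_example(obj):
    cprotect(obj)            # chmod -w on the cached file
    cdrop(obj)               # refused : the file is write-protected
    cdrop(obj, force=True)   # chmod +w first, then actual removal
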
def csync(update=False):
    """
    Merges current in-memory cache index and current on-file cache index
    for updating both

    If arg `update` is True, additionally ensures consistency between files
    set and index content, either :

    - if cache.stamping is true, by reading CRS in all files
    - else, by removing files which are not in the index; this may erase
      result files which have been computed by another running
      instance of CliMAF
    """
    # import pickle
    global cacheIndexFileName
    #
    # Merge index on file and index in memory
    file_index = cload(True)
    crs2filename.update(file_index)
    #
    # Check if cache index is up to date; if not, enforce consistency
    if update:
        clogger.info("Listing crs from files present in cache")
        files_in_cache = list_cache()
        files_in_cache.sort()
        files_in_index = crs2filename.values()
        files_in_index.sort()
        if files_in_index != files_in_cache:
            if stamping:
                clogger.info("Rebuilding cache index from file content")
                rebuild()
            else:
                clogger.info('Removing cache files which content is not known')
                for fil in files_in_cache:
                    if fil not in files_in_index:
                        os.system("rm %s" % fil)
                # else : should also remove empty files, as soon as
                # file creation will be atomic enough
    # Save to disk
    try:
        with open(os.path.expanduser(cacheIndexFileName), "w") as cacheIndexFile:
            pickle.dump(crs2filename, cacheIndexFile)
    except:
        clogger.info("No cache index file yet")

def selectGenericFiles(urls, return_wildcards=None, merge_periods_on=None, **kwargs):
    """
    Allows describing a ``generic`` file organization : the list of files returned
    by this function is composed of files which :

    - match the patterns in ``url`` once these patterns are instantiated by the values in kwargs, and
    - contain the ``variable`` provided in kwargs
    - match the ``period`` provided in kwargs

    In the pattern strings, no keyword is mandatory. However, for remote files,
    filename pattern must include ${varname}, which is instantiated by variable
    name or ``filenameVar`` (given via :py:func:`~climaf.classes.calias()`);
    this is for the sake of efficiency (please complain if inadequate)

    Example :

    >>> selectGenericFiles(project='my_project', model='my_model', simulation='lastexp',
    ...                    variable='tas', period='1980',
    ...                    urls=['~/DATA/${project}/${model}/*${variable}*${PERIOD}*.nc'])
    /home/stephane/DATA/my_project/my_model/somefilewith_tas_Y1980.nc

    In the pattern strings, the keywords that can be used in addition to the argument
    names (e.g. ${model}) are:

    - ${variable} : use it if the files are split by variable and filenames do
      include the variable name, as this speeds up the search
    - ${PERIOD} : use it for indicating the period covered by each file, if this
      is applicable in the file naming; this period can appear in filenames as
      YYYY, YYYYMM, YYYYMMDD, YYYYMMDDHHMM, either once only, or twice with
      separator '-' or '_'
    - wildcards '?' and '*' for matching respectively one and any number of characters
    """
    def store_wildcard_facet_values(f, facets_regexp, kwargs, wildcards, merge_periods_on=None,
                                    fperiod=None, periods=None, periods_dict=None):
        """Stores, for file f, the values of those facets which are a wildcard
        in kwargs, as well as its period, if requested"""
        if fperiod is not None and periods is not None:
            clogger.debug('Adding period %s' % fperiod)
            periods.append(fperiod)
        #
        for kw in kwargs:
            it = re.finditer(facets_regexp, f)
            for oc in it:
                try:
                    facet_value = oc.group(kw)
                except:
                    continue
                if type(kwargs[kw]) is str and ("*" in kwargs[kw] or "?" in kwargs[kw]):
                    if facet_value is not None:
                        if kw not in wildcards:
                            wildcards[kw] = set()
                        wildcards[kw].add(facet_value)
                        clogger.debug("Discover %s=%s for file=%s" % (kw, facet_value, f))
                    else:
                        clogger.debug("Logic issue for kw=%s and file=%s" % (kw, f))
                    #
                    if fperiod is not None and periods is not None:
                        if merge_periods_on is None:
                            key = None
                        elif kw == merge_periods_on:
                            key = facet_value
                        else:
                            # print("Skipping for kw=%s,sort=%s" % (kw, merge_periods_on))
                            continue
                        if key not in periods_dict:
                            periods_dict[key] = set()
                        # print("adding period %s for key %s" % (fperiod, key))
                        periods_dict[key].add(fperiod)
                    else:
                        pass
                        # print("not adding period for %s=%s for %s" % (kw, facet_value, f))
        # print("end of store, periods_dict=", periods_dict, "wild=", wildcards)

    rep = []
    #
    periods = None  # a list of periods available
    periods_dict = dict()
    #
    period = kwargs['period']
    if period == "*":
        periods = []  # List of all periods
    elif type(period) is str:
        period = init_period(period)
    #
    variable = kwargs['variable']
    altvar = kwargs.get('filenameVar', variable)
    #
    # dicts of date patterns, for globbing and for regexp
    #
    digit = "[0-9]"
    date_glob_patt = {"${PERIOD}": "*"}
    # an ordered list of dates keywords
    date_keywords = date_glob_patt.keys()
    date_keywords.sort(reverse=True)
    #
    annee = "%s{4}" % digit
    mois = "(01|02|03|04|05|06|07|08|09|10|11|12)"
    jour = "([0-3][0-9])"
    heure = "(00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23)"
    minutes = "[0-5][0-9]"
    date = "%s(%s(%s(%s(%s)?)?)?)?" % (annee, mois, jour, heure, minutes)
    rperiod = "(?P<period>(?P<start>%s)([_-](?P<end>%s))?)" % (date, date)
    date_regexp_patt = {"${PERIOD}": rperiod}
    # an ordered list of dates regexp keywords
    date_regexp_keywords = date_regexp_patt.keys()
    date_regexp_keywords.sort(reverse=True)
    #
    for l in urls:
        # Instantiate keywords in pattern with attributes values
        remote_prefix = ""
        if re.findall(".*:.*", l):
            remote_prefix = ':'.join(l.split(":")[0:-1]) + ':'
        basename = l.split(":")[-1]  # This discards the remote_prefix if any
        basename = basename.replace("//", "/")
        my_template = Template(basename)
        template = my_template.safe_substitute(**kwargs)
        # print("template after attributes replace : " + template)
        #
        # Construct a pattern for globbing dates
        temp2 = template
        for k in date_keywords:
            temp2 = temp2.replace(k, date_glob_patt[k])
        # Do globbing with plain varname
        if remote_prefix:
            lfiles = sorted(glob_remote_data(remote_prefix, temp2))
            clogger.debug("Remote globbing %d files for varname on %s : " %
                          (len(lfiles), remote_prefix + temp2))
        else:  # local data
            lfiles = sorted(glob.glob(temp2))
            clogger.debug("Before regexp filtering : Globbing %d files for varname on %s : " %
                          (len(lfiles), temp2))
            # Must filter with regexp, because * with glob is too inclusive
            alt = []
            for f in lfiles:
                for k in date_keywords:
                    if re.search(date_regexp_patt[k], f):
                        alt.append(f)
                        continue
            lfiles = alt
            clogger.debug("Globbing %d files for varname on %s : " % (len(lfiles), temp2))
        #
        # If unsuccessful using varname, try with filenameVar
        if len(lfiles) == 0 and "filenameVar" in kwargs and kwargs['filenameVar']:
            # Change value of facet 'variable'
            kwargs['variable'] = kwargs['filenameVar']
            template = my_template.safe_substitute(**kwargs)
            temp2 = template
            for k in date_keywords:
                temp2 = temp2.replace(k, date_glob_patt[k])
            #
            # Do globbing with fileVarname
            if remote_prefix:
                lfiles = sorted(glob_remote_data(remote_prefix, temp2))
                clogger.debug("Remote globbing %d files for filenamevar on %s: " %
                              (len(lfiles), remote_prefix + temp2))
            else:  # local data
                lfiles = sorted(glob.glob(temp2))
                # Must filter with regexp, because * with glob is too inclusive
                alt = []
                for f in lfiles:
                    for k in date_keywords:
                        if re.search(date_regexp_patt[k], f):
                            alt.append(f)
                            continue
                lfiles = alt
                clogger.debug("Globbing %d files for filenamevar on %s: " % (len(lfiles), temp2))
        #
        # For discovering values for those facets which are a wildcard,
        # construct a regexp with a group name for all facets (but period)
        alt_basename = basename.replace("?", ".").replace("*", ".*")
        alt_kwargs = kwargs.copy()
        for kw in kwargs:
            if type(kwargs[kw]) is str:  # This excludes period attribute, which has a type
                alt_kwargs[kw] = kwargs[kw].replace("?", ".").replace("*", ".*")
                alt_basename = alt_basename.replace(r"${%s}" % kw, r"(?P<%s>%s)" % (kw, alt_kwargs[kw]), 1)
        facets_regexp = Template(alt_basename).safe_substitute(**alt_kwargs)
        for k in date_regexp_keywords:
            facets_regexp = facets_regexp.replace(k, date_regexp_patt[k], 1)
            facets_regexp = facets_regexp.replace(k, ".*")
        wildcards = dict()
        # print("facets_regexp=", facets_regexp)
        #
        # Construct regexp for extracting dates from filename
        date_regexp = None
        template_toreg = template.replace("*", ".*").replace("?", r".").replace("+", r"\+")
        # print("template before searching dates : " + template_toreg)
        for key in date_regexp_keywords:
            # print("searching " + key + " in " + template)
            start = template_toreg.find(key)
            if start >= 0:
                date_regexp = template_toreg.replace(key, date_regexp_patt[key], 1)
                # print("found ", key, " dateregexp ->", date_regexp)
                hasEnd = False
                start = date_regexp.find(key)
                if start >= 0:
                    hasEnd = True
                    date_regexp = date_regexp.replace(key, date_regexp_patt[key], 1)
                break
        # print("date_regexp before searching dates : " + date_regexp)
        #
        for f in lfiles:
            # print("processing file " + f)
            #
            # Extract file time period
            #
            fperiod = None
            if date_regexp:
                if "P<period>" in date_regexp:
                    # print("date_regexp=", date_regexp, "f=", f)
                    # print("period=", re.sub(date_regexp, r'\g<period>', f))
                    tperiod = re.sub(date_regexp, r'\g<period>', f)
                    if tperiod == f:
                        raise classes.Climaf_Error("Cannot find a period in %s with regexp %s" % (f, date_regexp))
                    fperiod = init_period(tperiod)
                else:
                    date_regexp0 = date_regexp
                    # print("date_regexp for extracting dates : " + date_regexp0 + ", file=" + f)
                    start = re.sub(date_regexp0, r'\1', f)
                    if start == f:
                        raise Climaf_Data_Error("Start period not found in %s using regexp %s" % (f, date_regexp0))
                    if hasEnd:
                        end = re.sub(date_regexp0, r'\2', f)
                        fperiod = init_period("%s-%s" % (start, end))
                    else:
                        fperiod = init_period(start)
                # print("period for file %s is %s" % (f, fperiod))
                #
                # Filter file time period against required period
            else:
                if 'frequency' in kwargs and (kwargs['frequency'] == "fx" or
                                              kwargs['frequency'] == "seasonnal" or
                                              kwargs['frequency'] == "annual_cycle"):
                    # local data
                    if not remote_prefix and \
                            (basename.find("${variable}") >= 0 or variable == '*' or
                             fileHasVar(f, variable) or (variable != altvar and fileHasVar(f, altvar))):
                        clogger.debug("adding fixed field :" + f)
                        store_wildcard_facet_values(f, facets_regexp, kwargs, wildcards, merge_periods_on)
                        rep.append(f)
                    # remote data
                    elif remote_prefix:
                        if basename.find("${variable}") >= 0 or variable == '*' or \
                                (variable != altvar and f.find(altvar) >= 0):
                            clogger.debug("adding fixed field :" + remote_prefix + f)
                            store_wildcard_facet_values(f, facets_regexp, kwargs, wildcards, merge_periods_on)
                            rep.append(remote_prefix + f)
                        else:
                            raise classes.Climaf_Error(
                                "For remote files, filename pattern (%s) should include ${varname} "
                                "(which is instantiated by variable name or filenameVar)" % f)
                else:
                    clogger.info("Cannot yet filter files re. time using only file content.")
                    store_wildcard_facet_values(f, facets_regexp, kwargs, wildcards, merge_periods_on)
                    rep.append(f)
            #
            # If file period matches requested period, check similarly for variable
            #
            # print("fperiod=", fperiod, "periods=", periods)
            # print("inter=", period.intersects(fperiod), "date_regexp=", date_regexp)
            if (fperiod and (periods is not None or period.intersects(fperiod))) or not date_regexp:
                clogger.debug('Period is OK - Considering variable filtering on %s and %s for %s' %
                              (variable, altvar, f))
                # Filter against variable
                if l.find("${variable}") >= 0:
                    clogger.debug('appending %s based on variable in filename' % f)
                    store_wildcard_facet_values(f, facets_regexp, kwargs, wildcards, merge_periods_on,
                                                fperiod, periods, periods_dict)
                    rep.append(remote_prefix + f)
                    continue
                if f not in rep:
                    # local data
                    if not remote_prefix and \
                            (variable == '*' or "," in variable or fileHasVar(f, variable) or
                             (altvar != variable and fileHasVar(f, altvar))):
                        # Should check time period in the file if not date_regexp
                        clogger.debug('appending %s based on multi-var or var exists in file ' % f)
                        store_wildcard_facet_values(f, facets_regexp, kwargs, wildcards, merge_periods_on,
                                                    fperiod, periods, periods_dict)
                        rep.append(f)
                        continue
                    # remote data
                    elif remote_prefix:
                        if variable == '*' or "," in variable or \
                                (variable != altvar and f.find(altvar) >= 0):
                            # Should check time period in the file if not date_regexp
                            clogger.debug('appending %s based on multi-var or altvar ' % (remote_prefix + f))
                            store_wildcard_facet_values(f, facets_regexp, kwargs, wildcards, merge_periods_on,
                                                        fperiod, periods, periods_dict)
                            rep.append(remote_prefix + f)
                            continue
                        else:
                            mess = "For remote files, filename pattern (%s) should include" % (remote_prefix + f)
                            mess += " ${varname} (which is instantiated by variable name or filenameVar)"
                            raise classes.Climaf_Error(mess)
            else:
                if not fperiod:
                    clogger.debug('not appending %s because period is None ' % f)
                elif not period.intersects(fperiod):
                    clogger.debug("not appending %s because period doesn't intersect %s" % (f, period))
                else:
                    clogger.debug('not appending %s for some other reason' % f)
        # Break on first url with any matching data
        if len(rep) > 0:
            clogger.debug('url %s does match for ' % l + repr(kwargs))
            break
    #
    # For wildcard facets, discover facet values + checks
    for facet in wildcards:
        s = wildcards[facet]
        if return_wildcards is not None:
            if facet == "period":
                # print("s=", s, " periods_dict=", periods_dict)
                for val in periods_dict:
                    periods_dict[val] = sort_periods_list(list(periods_dict[val]))
                clogger.info("Attribute period='*' has values %s" % periods_dict)
                return_wildcards["period"] = periods_dict
            else:
                if len(s) == 1:
                    s = s.pop()
                    clogger.info("Attribute %s='%s' has matching value '%s'" % (facet, kwargs[facet], s))
                    return_wildcards[facet] = s
                else:
                    vals = list(s)
                    vals.sort()
                    return_wildcards[facet] = vals
                    message = "Attribute %s='%s' has multiple values : %s" % (facet, kwargs[facet], list(s))
                    if return_wildcards:
                        clogger.info(message)
                    else:
                        clogger.error(message)
                    s = return_wildcards[facet]
        else:
            clogger.debug("return_wildcards is None")
    return rep

def cdrop(obj, rm=True, force=False):
    """
    Deletes the cached file for a CliMAF object, if it exists

    Args:
     obj (cobject or string) : object to delete, or its string representation (CRS)
     force (bool) : should we delete the object even if it is 'protected'
     rm (bool) : for advanced use only; should we actually delete (rm) the file,
      or just forget it in CliMAF cache index

    Returns:
     None if object does not exist, False if failing to delete, True if OK

    Example ::

    >>> dg=ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
    >>> f=cfile(dg)
    >>> os.system('ls -al '+f)
    >>> cdrop(dg)
    """
    global crs2filename
    global dropped_crs
    if isinstance(obj, cobject):
        crs = repr(obj)
        if isinstance(obj, cdataset):
            crs = "select(" + crs + ")"
    elif type(obj) is str:
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    if crs in crs2filename:
        clogger.info("Discarding cached value for %s (except if protected)" % crs)
        fil = crs2filename[crs]
        if rm:
            try:
                if force:
                    os.system("chmod +w " + fil)
                if not os.access(fil, os.W_OK):
                    clogger.info("Object %s is protected" % crs)
                    return
                path_file = os.path.dirname(fil)
                os.remove(fil)
                crs2filename.pop(crs)
                dropped_crs.append(crs)
                try:
                    os.rmdir(path_file)
                except OSError as ex:
                    clogger.warning(ex)
                return True
            except:
                clogger.warning("When trying to remove %s : file does not exist in cache" % crs)
                return False
    else:
        clogger.info("%s is not cached" % crs)
        return None

def register(filename, crs, outfilename=None):
    """
    Adds in FILE a metadata named 'CRS_def' and with value CRS, and a
    metadata 'CliMAF' with CliMAF version and reference URL

    Records this FILE in dict crs2filename

    If OUTFILENAME is not None, FILENAME is a temporary file and it is
    OUTFILENAME which is recorded in dict crs2filename

    Silently skips non-existing files
    """
    # First read index from file if it is yet empty - No : done at startup
    # if len(crs2filename.keys()) == 0 : cload()
    # It appears that we have to let some time to the file system for updating its inode tables
    global dropped_crs
    if not stamping:
        clogger.debug('No stamping')
        crs2filename[crs] = filename
        return True
    waited = 0
    while waited < 50 and not os.path.exists(filename):
        time.sleep(0.1)
        waited += 1
    # time.sleep(0.5)
    if os.path.exists(filename):
        # while time.time() < os.path.getmtime(filename) + 0.2 : time.sleep(0.2)
        if re.findall(".nc$", filename):
            command = "ncatted -h -a CRS_def,global,o,c,\"%s\" -a CliMAF,global,o,c,\"CLImate Model Assessment " \
                      "Framework version %s (http://climaf.rtfd.org)\" %s" % (crs, version, filename)
        if re.findall(".png$", filename):
            crs2 = crs.replace("%", "\%")
            command = "convert -set \"CRS_def\" \"%s\" -set \"CliMAF\" \"CLImate Model Assessment Framework version " \
                      "%s (http://climaf.rtfd.org)\" %s %s.png && mv -f %s.png %s" % \
                      (crs2, version, filename, filename, filename, filename)
        if re.findall(".pdf$", filename):
            tmpfile = str(uuid.uuid4())
            command = "pdftk %s dump_data output %s && echo -e \"InfoBegin\nInfoKey: Keywords\nInfoValue: %s\" >> %s " \
                      "&& pdftk %s update_info %s output %s.pdf && mv -f %s.pdf %s && rm -f %s" % \
                      (filename, tmpfile, crs, tmpfile, filename, tmpfile, filename, filename, filename, tmpfile)
        if re.findall(".eps$", filename):
            command = 'exiv2 -M"add Xmp.dc.CliMAF CLImate Model Assessment Framework version %s ' \
                      '(http://climaf.rtfd.org)" -M"add Xmp.dc.CRS_def %s" %s' % \
                      (version, crs, filename)
        clogger.debug("trying stamping by %s" % command)
        if os.system(command) == 0:
            if outfilename:
                cmd = 'mv -f %s %s ' % (filename, outfilename)
                if os.system(cmd) == 0:
                    clogger.info("move %s as %s " % (filename, outfilename))
                    clogger.info("%s registered as %s" % (crs, outfilename))
                    crs2filename[crs] = outfilename
                    if crs in dropped_crs:
                        dropped_crs.remove(crs)
                    return True
                else:
                    clogger.critical("cannot move by %s" % cmd)
                    exit()
                    return None
            else:
                clogger.info("%s registered as %s" % (crs, filename))
                crs2filename[crs] = filename
                if crs in dropped_crs:
                    dropped_crs.remove(crs)
                return True
        else:
            clogger.critical("cannot stamp by %s" % command)
            exit()
            return None
    else:
        clogger.error("file %s does not exist (for crs %s)" % (filename, crs))

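# A sketch of the two ways register() is meant to be called (the paths are
# hypothetical) : directly on the final file, or on a temporary file which is
# moved to its final name only once stamping has succeeded, so that readers
# never see a half-stamped file.
def _register_usage_example(crs):
    register("/path/to/cache/result.nc", crs)         # stamp in place
    register("/path/to/cache/tmp1234.nc", crs,
             outfilename="/path/to/cache/result.nc")  # stamp, then mv to final name
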
def ceval_script(scriptCall, deep, recurse_list=[]):
    """
    Actually applies a CliMAF-declared script on a script_call object

    Prepares operands as files and builds command from operands and parameters list

    Assumes that scripts are described in dictionary 'scripts' by templates as
    documented in operators.cscript

    Returns a CliMAF cache data filename
    """
    script = operators.scripts[scriptCall.operator]
    template = Template(script.command)
    # Evaluate input data
    dict_invalues = dict()
    sizes = []
    for op in scriptCall.operands:
        inValue = ceval(op, userflags=scriptCall.flags, format='file', deep=deep,
                        recurse_list=recurse_list)
        if inValue is None or inValue == "":
            raise Climaf_Driver_Error("When evaluating %s : value for %s is None" %
                                      (scriptCall.script, repr(op)))
        if isinstance(inValue, list):
            size = len(inValue)
        else:
            size = 1
        sizes.append(size)
        dict_invalues[op] = inValue
    #
    # Replace input data placeholders with filenames
    subdict = dict()
    opscrs = ""
    if 0 in script.inputs:
        label, multiple, serie = script.inputs[0]
        op = scriptCall.operands[0]
        infile = dict_invalues[op]
        if not all(map(os.path.exists, infile.split(" "))):
            raise Climaf_Driver_Error("Internal error : some input file does not exist among %s:" % infile)
        subdict[label] = infile
        # if scriptCall.flags.canSelectVar :
        subdict["var"] = varOf(op)
        if isinstance(op, classes.cdataset) and op.alias and scriptCall.flags.canAlias:
            filevar, scale, offset, units, filenameVar, missing = op.alias
            # if script == "select" and ((varOf(op) != filevar) or scale != 1.0 or offset != 0.) :
            if (varOf(op) != filevar) or scale != 1.0 or offset != 0.:
                subdict["alias"] = "%s,%s,%.4g,%.4g" % (varOf(op), filevar, scale, offset)
                subdict["var"] = filevar
                if units:
                    subdict["units"] = units
            if scriptCall.flags.canMissing and missing:
                subdict["missing"] = missing
        if isinstance(op, classes.cens):
            if not multiple:
                raise Climaf_Driver_Error("Script %s 's input #%s cannot accept ensemble %s" %
                                          (scriptCall.script, 0, repr(op)))
            # subdict["labels"] = r'"' + reduce(lambda x, y: "'" + x + "' '" + y + "'", op.labels) + r'"'
            subdict["labels"] = reduce(lambda x, y: x + "$" + y, op.labels)
        per = timePeriod(op)
        if not per.fx and str(per) != "" and scriptCall.flags.canSelectTime:
            subdict["period"] = str(per)
            subdict["period_iso"] = per.iso()
        if scriptCall.flags.canSelectDomain:
            subdict["domain"] = domainOf(op)
    i = 0
    for op in scriptCall.operands:
        opscrs += op.crs + " - "
        infile = dict_invalues[op]
        if not all(map(os.path.exists, infile.split(" "))):
            raise Climaf_Driver_Error("Internal error : some input file does not exist among %s:" % infile)
        i += 1
        if i > 1 or 1 in script.inputs:
            label, multiple, serie = script.inputs[i]
            subdict[label] = infile
            # Provide the name of the variable in input file if script allows for
            subdict["var_%d" % i] = varOf(op)
            if isinstance(op, classes.cdataset) and op.alias:
                filevar, scale, offset, units, filenameVar, missing = op.alias
                if (varOf(op) != filevar) or (scale != 1.0) or (offset != 0.):
                    subdict["alias_%d" % i] = "%s %s %f %f" % (varOf(op), filevar, scale, offset)
                    subdict["var_%d" % i] = filevar
                    if units:
                        subdict["units_%d" % i] = units
                    if missing:
                        subdict["missing_%d" % i] = missing
            # Provide period selection if script allows for
            per = timePeriod(op)
            if not per.fx and per != "":
                subdict["period_%d" % i] = str(per)
                subdict["period_iso_%d" % i] = per.iso()
            subdict["domain_%d" % i] = domainOf(op)
    clogger.debug("subdict for operands is " + repr(subdict))
    # substitution is deferred after scriptcall parameters evaluation, which may
    # redefine e.g. period
    #
    # Provide one cache filename for each output and instantiate the command accordingly
    if script.outputFormat is not None:
        # Compute a filename for each output
        # Un-named main output
        main_output_filename = cache.generateUniqueFileName(scriptCall.crs,
                                                            format=script.outputFormat)
        subdict["out"] = main_output_filename
        subdict["out_" + scriptCall.variable] = main_output_filename
        # Named outputs
        for output in scriptCall.outputs:
            subdict["out_" + output] = cache.generateUniqueFileName(scriptCall.crs + "." + output,
                                                                    format=script.outputFormat)
    # Account for script call parameters
    for p in scriptCall.parameters:
        # clogger.debug("processing parameter %s=%s" % (p, scriptCall.parameters[p]))
        subdict[p] = scriptCall.parameters[p]
        if p == "period":
            subdict["period_iso"] = init_period(scriptCall.parameters[p]).iso()
    subdict["crs"] = opscrs.replace("'", "")
    #
    # Combine CRS and possibly member_label to provide/complement title
    if 'title' not in subdict:
        if 'member_label' in subdict:
            subdict["title"] = subdict['member_label']
        else:
            subdict["title"] = subdict["crs"]
    else:
        if 'member_label' in subdict:
            subdict["title"] = subdict["title"] + " " + subdict['member_label']
            subdict.pop('member_label')
    #
    # Substitute all args
    template = template.safe_substitute(subdict)
    #
    # Allowing for some formal parameters to be missing in the actual call:
    #
    # Discard remaining substrings looking like :
    #  some_word='"${some_keyword}"' , or :
    #  '"${some_keyword}"'
    template = re.sub(r'(\w*=)?(\'\")?\$\{\w*\}(\"\')?', r"", template)
    #
    # Discard remaining substrings looking like :
    #  some_word=${some_keyword} , or
    #  ${some_keyword}
    template = re.sub(r"(\w*=)?\$\{\w*\}", r"", template)
    #
    # Launch script using command, and check termination
    # command="PATH=$PATH:"+operators.scriptsPath+template+fileVariables
    # command="echo '\n\nstdout and stderr of script call :\n\t "+template+\
    #          "\n\n'> scripts.out ; "+ template+ " >> scripts.out 2>&1"
    tim1 = time.time()
    clogger.info("Launching command:" + template)
    #
    command = subprocess.Popen(template, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    command.wait()
    #
    logfile = open('last.out', 'w')
    logfile.write("\n\nstdout and stderr of script call :\n\t " + template + "\n\n")
    command_std = ""
    for line in command.stdout:
        command_std += line
        logfile.write(line)
    logfile.close()
    if command.wait() == 0:
        if script.outputFormat is not None:
            # Tagging output files with their CliMAF Reference Syntax definition
            # Un-named main output
            ok = cache.register(main_output_filename, scriptCall.crs)
            # Named outputs
            for output in scriptCall.outputs:
                ok = ok and cache.register(subdict["out_" + output],
                                           scriptCall.crs + "." + output)
            if ok:
                duration = time.time() - tim1
                print("Done in %.1f s with script computation for %s " %
                      (duration, repr(scriptCall)), file=sys.stderr)
                clogger.debug("Done in %.1f s with script computation for "
                              "%s (command was :%s )" % (duration, repr(scriptCall), template))
                return main_output_filename
            else:
                raise Climaf_Driver_Error("Some output missing when executing "
                                          ": %s. \n See last.out" % template)
        else:
            clogger.debug("script %s has no output" % script.name)
            return None
    else:
        clogger.debug("Full script output:\n" + command_std)
        comm2 = subprocess.Popen(["tail", "-n", "10", "last.out"], stdout=subprocess.PIPE)
        clogger.error("Last lines of script output:\n" + comm2.stdout.read())
        raise Climaf_Driver_Error("Script failure for : %s. More details either in file "
                                  "./last.out or by re-running with clog(\"debug\")" % template)

def ceval(cobject, userflags=None, format="MaskedArray",
          deep=None, derived_list=[], recurse_list=[]):
    """
    Actually evaluates a CliMAF object, either as an in-memory data structure or
    as a string of filenames (which either represent a superset or exactly
    includes the desired data)

    - with arg deep=True , re-evaluates all components
    - with arg deep=False, re-evaluates top level operation
    - without arg deep   , uses cached values as far as possible

    arg derived_list is the list of variables that have been considered as
    'derived' (i.e. not native) in upstream evaluations. It avoids looping
    endlessly
    """
    if format != 'MaskedArray' and format != 'file' and format != 'png':
        raise Climaf_Driver_Error('Allowed formats yet are : "MaskedArray", "file" and "png"')
    #
    if userflags is None:
        userflags = operators.scriptFlags()
    #
    # Next check is too crude for dealing with use of operator 'select'
    # if cobject.crs in recurse_list :
    #     clogger.critical("INTERNAL ERROR : infinite loop on object: " + cobject.crs)
    #     return None
    cindent()
    if isinstance(cobject, classes.cdataset):
        recurse_list.append(cobject.crs)
        clogger.debug("Evaluating dataset operand " + cobject.crs + " having kvp=" + repr(cobject.kvp))
        ds = cobject
        if ds.isLocal() or ds.isCached():
            clogger.debug("Dataset %s is local or cached " % ds)
            # If the data is local, then
            #   if the user can select the data and aggregate time, and requested
            #   format is 'file', return the filenames
            #   else : read the data, create a cache file for that, and recurse
            # Go to derived variable evaluation if applicable
            # if ds.variable in operators.derived_variables and not ds.hasRawVariable() :
            if operators.is_derived_variable(ds.variable, ds.project):
                if ds.variable in derived_list:
                    raise Climaf_Driver_Error("Loop detected while evaluating "
                                              "derived variable " + ds.variable + " " + repr(derived_list))
                derived = derive_variable(ds)
                clogger.debug("evaluating derived variable %s as %s" % (ds.variable, repr(derived)))
                derived_value = ceval(derived, format=format, deep=deep,
                                      userflags=userflags,
                                      derived_list=derived_list + [ds.variable],
                                      recurse_list=recurse_list)
                if derived_value:
                    clogger.debug("succeeded in evaluating derived variable %s as %s" %
                                  (ds.variable, repr(derived)))
                    set_variable(derived_value, ds.variable, format=format)
                cdedent()
                return derived_value
            elif ((userflags.canSelectVar or ds.oneVarPerFile()) and
                  (userflags.canSelectTime or ds.periodIsFine()) and
                  (userflags.canSelectDomain or ds.domainIsFine()) and
                  (userflags.canAggregateTime or ds.periodHasOneFile()) and
                  (userflags.canAlias or ds.hasExactVariable()) and
                  (userflags.canMissing or ds.missingIsOK()) and
                  # (userflags.doSqueezeMembers or ds.hasOneMember()) and
                  (format == 'file')):
                clogger.debug("Delivering file set or sets is OK for the target use")
                cdedent()
                rep = ds.baseFiles()
                if not rep:
                    raise Climaf_Driver_Error("No file found for %s" % repr(ds))
                return rep  # a single string with all filenames,
                            # or a list of such strings in case of ensembles
            else:
                clogger.debug("Must subset and/or aggregate and/or select " +
                              "var from data files and/or get data, or provide object result")
                # extract = cread(ds)
                # clogger.debug(" Done with subsetting and caching data files")
                # cstore(extract)  # extract should include the dataset def
                # return ceval(extract, userflags, format)
                clogger.debug("Fetching/selection/aggregation is done using an external script for now - TBD")
                extract = capply('select', ds)
                if extract is None:
                    raise Climaf_Driver_Error("Cannot access dataset" + repr(ds))
                rep = ceval(extract, userflags=userflags, format=format)
                userflags.unset_selectors()
                cdedent()
                return rep
        else:
            # else (non-local and non-cached dataset)
            #   if the user can access the dataset by one of the dataset-specific protocols
            #   then assume it can also select on time and provide it with the address
            #   else : fetch the relevant selection of the data, and store it in cache
            clogger.debug("Dataset is remote ")
            if userflags.canOpenDap and format == 'file':
                clogger.debug("But user can OpenDAP ")
                cdedent()
                return ds.adressOf()
            else:
                clogger.debug("Must remote read and cache ")
                rep = ceval(capply('remote_select', ds), userflags=userflags, format=format)
                userflags.unset_selectors()
                cdedent()
                return rep
    #
    elif isinstance(cobject, classes.ctree) or isinstance(cobject, classes.scriptChild) or \
            isinstance(cobject, classes.cpage) or isinstance(cobject, classes.cens):
        recurse_list.append(cobject.buildcrs())
        clogger.debug("Evaluating compound object : " + repr(cobject))
        #################################################################
        if deep is not None:
            cache.cdrop(cobject.crs)
        #
        clogger.debug("Searching cache for exact object : " + repr(cobject))
        #################################################################
        filename = cache.hasExactObject(cobject)
        # filename = None
        if filename:
            clogger.info("Object found in cache: %s is at %s: " % (cobject.crs, filename))
            cdedent()
            if format == 'file':
                return filename
            else:
                return cread(filename, varOf(cobject))
        if not isinstance(cobject, classes.cpage) and not isinstance(cobject, classes.cens):
            #
            clogger.debug("Searching cache for including object for : " + repr(cobject))
            ########################################################################
            it, altperiod = cache.hasIncludingObject(cobject)
            # clogger.debug("Finished with searching cache for including object for : " + repr(cobject))
            # it = None
            if it:
                clogger.info("Including object found in cache : %s" % it.crs)
                clogger.info("Selecting " + repr(cobject) + " out of it")
                # Just select (if necessary for the user) the portion relevant to the request
                rep = ceval_select(it, cobject, userflags, format, deep, derived_list, recurse_list)
                cdedent()
                return rep
            #
            clogger.debug("Searching cache for begin object for : " + repr(cobject))
            ########################################################################
            it, comp_period = cache.hasBeginObject(cobject)
            clogger.debug("Finished with searching cache for begin object for : " + repr(cobject))
            # it = None
            if it:
                clogger.info("partial result found in cache for %s : %s" % (cobject.crs, it.crs))
                begcrs = it.crs
                # Turn object for begin in complement object for end, and eval it
                it.setperiod(comp_period)
                ceval(it, userflags, format, deep, derived_list, recurse_list)
                if format == 'file':
                    rep = cache.complement(begcrs, it.crs, cobject.crs)
                    cdedent()
                    return rep
                else:
                    raise Climaf_Driver_Error("cannot yet complement except for files")
        #
        clogger.info("nothing relevant found in cache for %s" % cobject.crs)
        #
        if deep == False:
            deep = None
        if isinstance(cobject, classes.ctree):
            #
            # The cache doesn't have a similar tree, let us recursively eval subtrees
            ##########################################################################
            # TBD : analyze if the dataset is remote and the remote place 'offers' the operator
            if cobject.operator in operators.scripts:
                file = ceval_script(cobject, deep, recurse_list=recurse_list)
                # Does return a filename, or list of filenames
                cdedent()
                if format == 'file':
                    return file
                else:
                    return cread(file, varOf(cobject))
            elif cobject.operator in operators.operators:
                obj = ceval_operator(cobject, deep)
                cdedent()
                if format == 'file':
                    rep = cstore(obj)
                    return rep
                else:
                    return obj
            else:
                raise Climaf_Driver_Error("operator %s is not a script nor known operator" % str(cobject.operator))
        elif isinstance(cobject, classes.scriptChild):
            # Force evaluation of 'father' script
            if ceval_script(cobject.father, deep, recurse_list=recurse_list) is not None:
                # Re-evaluate, which should succeed using cache
                rep = ceval(cobject, userflags, format, deep, recurse_list=recurse_list)
                cdedent()
                return rep
            else:
                raise Climaf_Driver_Error("generating script aborted for " + cobject.father.crs)
        elif isinstance(cobject, classes.cpage):
            file = cfilePage(cobject, deep, recurse_list=recurse_list)
            cdedent()
            if format == 'file':
                return file
            else:
                return cread(file)
        elif isinstance(cobject, classes.cens):
            rep = []
            for member in cobject.members:
                rep.append(ceval(member, copy.copy(userflags), format, deep, recurse_list=recurse_list))
            if format == "file":
                return reduce(lambda x, y: x + " " + y, rep)
            else:
                return rep
        else:
            raise Climaf_Driver_Error("Internal logic error")
    elif isinstance(cobject, str):
        clogger.debug("Evaluating object from crs : %s" % cobject)
        raise Climaf_Driver_Error("Evaluation from CRS is not yet implemented ( %s )" % cobject)
    else:
        raise Climaf_Driver_Error("argument " + repr(cobject) + " is not (yet) managed")

def selectGenericFiles(urls, **kwargs):
    """
    Allows describing a ``generic`` file organization : the list of files returned
    by this function is composed of files which :

    - match the patterns in ``url`` once these patterns are instantiated by the values in kwargs, and
    - contain the ``variable`` provided in kwargs
    - match the ``period`` provided in kwargs

    In the pattern strings, no keyword is mandatory. However, for remote files,
    filename pattern must include ${varname}, which is instantiated by variable
    name or ``filenameVar`` (given via :py:func:`~climaf.classes.calias()`);
    this is for the sake of efficiency (please complain if inadequate)

    Example :

    >>> selectGenericFiles(project='my_project', model='my_model', simulation='lastexp',
    ...                    variable='tas', period='1980',
    ...                    urls=['~/DATA/${project}/${model}/*${variable}*YYYY*.nc'])
    /home/stephane/DATA/my_project/my_model/somefilewith_tas_Y1980.nc

    In the pattern strings, the keywords that can be used in addition to the argument
    names (e.g. ${model}) are:

    - ${variable} : use it if the files are split by variable and filenames do
      include the variable name, as this speeds up the search
    - YYYY, YYYYMM, YYYYMMDD : use it for indicating the start date of the period
      covered by each file, if this is applicable in the file naming; use it a
      second time for the end date, if applicable (otherwise the assumption is
      that the whole year - resp. month or day - is included in the file)
    - wildcards '?' and '*' for matching respectively one and any number of characters
    """
    rep = []
    period = kwargs['period']
    if type(period) is str:
        period = init_period(period)
    variable = kwargs['variable']
    altvar = kwargs.get('filenameVar', variable)
    # a dict and an ordered list of date globbing patterns
    dt = dict(YYYY="????", YYYYMM="??????", YYYYMMDD="????????", YYYYMMDDHH="??????????")
    lkeys = dt.keys()
    lkeys.sort(reverse=True)
    # a dict and an ordered list for matching dates
    dr = dict(YYYY="([0-9]{4})", YYYYMM="([0-9]{6})", YYYYMMDD="([0-9]{8})", YYYYMMDDHH="([0-9]{10})")
    rkeys = dr.keys()
    rkeys.sort(reverse=True)
    #
    for l in urls:
        # Instantiate keywords in pattern with attributes values
        if re.findall(".*:.*", l):  # remote data
            remote_prefix = ':'.join(l.split(":")[0:-1]) + ':'
            template = Template(l.split(":")[-1]).safe_substitute(**kwargs)
        else:  # local data
            remote_prefix = ""
            template = Template(l).safe_substitute(**kwargs)
        # print("template after attributes replace : " + template)
        #
        # Construct a pattern for globbing dates
        temp2 = template
        for k in lkeys:
            temp2 = temp2.replace(k, dt[k])
        if remote_prefix:
            lfiles = sorted(glob_remote_data(remote_prefix, temp2))
            clogger.debug("Remote globbing %d files for varname on %s : " % (len(lfiles), remote_prefix + temp2))
        else:  # local data
            lfiles = sorted(glob.glob(temp2))
            clogger.debug("Globbing %d files for varname on %s : " % (len(lfiles), temp2))
        #
        # If unsuccessful using varname, try with filenameVar
        if len(lfiles) == 0 and "filenameVar" in kwargs and kwargs['filenameVar']:
            # Change value of facet 'variable'
            kwargs['variable'] = kwargs['filenameVar']
            if remote_prefix:  # remote data
                template = Template(l.split(":")[-1]).safe_substitute(**kwargs)
            else:  # local data
                template = Template(l).safe_substitute(**kwargs)
            temp2 = template
            for k in lkeys:
                temp2 = temp2.replace(k, dt[k])
            #
            if remote_prefix:
                lfiles = sorted(glob_remote_data(remote_prefix, temp2))
                clogger.debug("Globbing %d files for filenamevar on %s: " % (len(lfiles), remote_prefix + temp2))
            else:  # local data
                lfiles = sorted(glob.glob(temp2))
                clogger.debug("Globbing %d files for filenamevar on %s: " % (len(lfiles), temp2))
        #
        # Construct regexp for extracting dates from filename
        regexp = None
        # print("template before searching dates : " + template)
        for key in rkeys:
            # print("searching " + key + " in " + template)
            start = template.find(key)
            if start >= 0:
                # print("found " + key)
                regexp = template.replace(key, dr[key], 1)
                hasEnd = False
                start = regexp.find(key)
                if start >= 0:
                    hasEnd = True
                    regexp = regexp.replace(key, dr[key], 1)
                break
        # print("regexp before searching dates : " + regexp)
        #
        for f in lfiles:
            # print("processing file " + f)
            #
            # Analyze file time period
            fperiod = None
            if regexp:
                regexp0 = regexp.replace("*", ".*").replace("?", r".")
                # print("regexp for extracting dates : " + regexp0)
                start = re.sub(regexp0, r'\1', f)
                if start == f:
                    raise Climaf_Data_Error("Start period not found")
                if hasEnd:
                    end = re.sub(regexp0, r'\2', f)
                    fperiod = init_period("%s-%s" % (start, end))
                else:
                    fperiod = init_period(start)
                # print("period for file %s is %s" % (f, fperiod))
                #
                # Filter file time period against required period
            else:
                if 'frequency' in kwargs and (kwargs['frequency'] == "fx" or
                                              kwargs['frequency'] == "seasonnal" or
                                              kwargs['frequency'] == "annual_cycle"):
                    # local data
                    if not remote_prefix and \
                            (l.find("${variable}") >= 0 or variable == '*' or
                             fileHasVar(f, variable) or (variable != altvar and fileHasVar(f, altvar))):
                        clogger.debug("adding fixed field :" + f)
                        rep.append(f)
                    # remote data
                    elif remote_prefix != "":
                        if l.split(":")[-1].find("${variable}") >= 0 or variable == '*' or \
                                (variable != altvar and f.find(altvar) >= 0):
                            clogger.debug("adding fixed field :" + remote_prefix + f)
                            rep.append(remote_prefix + f)
                        else:
                            raise Climaf_Data_Error(
                                "For remote files, filename pattern (%s) should include ${varname} "
                                "(which is instantiated by variable name or filenameVar)" % f)
                else:
                    clogger.info("Cannot yet filter files re. time using only file content.")
                    rep.append(f)
            #
            if (fperiod and period.intersects(fperiod)) or not regexp:
                clogger.debug('Period is OK - Considering variable filtering on %s and %s for %s' %
                              (variable, altvar, f))
                # Filter against variable
                if l.find("${variable}") >= 0:
                    clogger.debug('appending %s based on variable in filename' % f)
                    rep.append(remote_prefix + f)
                    continue
                if f not in rep:
                    # local data
                    if not remote_prefix and \
                            (variable == '*' or "," in variable or fileHasVar(f, variable) or
                             (altvar != variable and fileHasVar(f, altvar))):
                        # Should check time period in the file if not regexp
                        clogger.debug('appending %s based on multi-var or var exists in file ' % f)
                        rep.append(f)
                        continue
                    # remote data
                    elif remote_prefix:
                        if variable == '*' or "," in variable or \
                                (variable != altvar and f.find(altvar) >= 0):
                            # Should check time period in the file if not regexp
                            clogger.debug('appending %s based on multi-var or altvar ' % (remote_prefix + f))
                            rep.append(remote_prefix + f)
                            continue
                        else:
                            mess = "For remote files, filename pattern (%s) should include" % (remote_prefix + f)
                            mess += " ${varname} (which is instantiated by variable name or filenameVar)"
                            raise Climaf_Data_Error(mess)
            else:
                if not fperiod:
                    clogger.debug('not appending %s because period is None ' % f)
                else:
                    if not period.intersects(fperiod):
                        clogger.debug("not appending %s because period doesn't intersect %s" % (f, period))
        # Break on first url with any matching data
        if len(rep) > 0:
            clogger.debug('url %s does match for ' % l + repr(kwargs))
            break
    return rep

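# A sketch of the two-step date handling above, on a hypothetical pattern : the
# YYYY keyword first globs as '????', then is turned into a capturing regexp,
# whose first (and optional second) group yields the start (and end) date.
def _date_keyword_example():
    import re
    template = "/data/tas_YYYY-YYYY.nc"
    glob_pattern = template.replace("YYYY", "????")   # '/data/tas_????-????.nc'
    regexp = template.replace("YYYY", "([0-9]{4})", 1)
    regexp = regexp.replace("YYYY", "([0-9]{4})", 1)  # second occurrence = end date
    f = "/data/tas_1980-1989.nc"
    start = re.sub(regexp, r'\1', f)                  # '1980'
    end = re.sub(regexp, r'\2', f)                    # '1989'
    return glob_pattern, start, end
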