def periodOfEmFile(filename, realm, freq):
    """
    Return the period covered by a file handled by EM, based on filename
    rules for EM.

    Returns None if the file's frequency does not fit ``freq``.
    """
    if realm in ('A', 'L'):
        # Atmosphere / land realms : only monthly files are handled,
        # whose names end with the 4-digit year, as in ....YYYY.nc
        if freq not in ('mon', ''):
            raise classes.Climaf_Error("can yet handle only monthly frequency for realms A and L - TBD")
        year = re.sub(r'^.*([0-9]{4}).nc', r'\1', filename)
        if year.isdigit():
            return init_period("%s01-%s12" % (year, year))
        # no 4-digit year found in filename : no period
    elif realm in ('O', 'I'):
        # Ocean / ice realms : filenames carry a frequency marker (_1m_ / _1d_)
        # and a begin/end date pair, as in ..._1m_YYYYMMDD_YYYYMMDD...nc
        if freq in ('monthly', 'mon', ''):
            altfreq = 'm'
        elif freq[0:2] == 'da':
            altfreq = 'd'
        else:
            raise classes.Climaf_Error("Can yet handle only monthly and daily frequency for realms O and I - TBD")
        patt = r'^.*_1' + altfreq + r'_([0-9]{8})_*([0-9]{8}).*nc'
        beg = re.sub(patt, r'\1', filename)
        end = re.sub(patt, r'\2', filename)
        # re.sub returns the input unchanged when the pattern does not match
        if beg == filename or end == filename:
            return None
        return init_period("%s-%s" % (beg, end))
    else:
        raise classes.Climaf_Error("unexpected realm " + realm)
def glob_remote_data(remote, pattern): """ Returns a list of path names that match pattern, for remote data located atremote """ if len(remote.split(":")) == 3: k = 1 else: k = 0 k = 0 if re.findall("@", remote.split(":")[k]): username = remote.split(":")[k].split("@")[0] host = remote.split(":")[k].split("@")[-1] else: username = '' host = remote.split(":")[k] secrets = netrc.netrc() if username: if host in secrets.hosts: login, account, password = secrets.authenticators(host) if login != username: password = getpass.getpass( "Password for host '%s' and user '%s': " % (host, username)) else: password = getpass.getpass( "Password for host '%s' and user '%s': " % (host, username)) else: if host in secrets.hosts: username, account, password = secrets.authenticators(host) else: username = raw_input("Enter login for host '%s': " % host) password = getpass.getpass( "Password for host '%s' and user '%s': " % (host, username)) try: connect = ftp.FTP(host, username, password) listfiles = ftpmatch(connect, pattern) connect.quit() return listfiles except ftp.all_errors as err_ftp: print err_ftp raise classes.Climaf_Error( "Access problem for data %s on host '%s' and user '%s'" % (pattern, host, username))
def selectGenericFiles(urls, return_wildcards=None,merge_periods_on=None,**kwargs): """ Allow to describe a ``generic`` file organization : the list of files returned by this function is composed of files which : - match the patterns in ``url`` once these patterns are instantiated by the values in kwargs, and - contain the ``variable`` provided in kwargs - match the `period`` provided in kwargs In the pattern strings, no keyword is mandatory. However, for remote files, filename pattern must include ${varname}, which is instanciated by variable name or ``filenameVar`` (given via :py:func:`~climaf.classes.calias()`); this is for the sake of efficiency (please complain if inadequate) Example : >>> selectGenericFiles(project='my_projet',model='my_model', simulation='lastexp', variable='tas', period='1980', urls=['~/DATA/${project}/${model}/*${variable}*${PERIOD}*.nc)'] /home/stephane/DATA/my_project/my_model/somefilewith_tas_Y1980.nc In the pattern strings, the keywords that can be used in addition to the argument names (e.g. ${model}) are: - ${variable} : use it if the files are split by variable and filenames do include the variable name, as this speed up the search - ${PERIOD} : use it for indicating the period covered by each file, if this is applicable in the file naming; this period can appear in filenames as YYYY, YYYYMM, YYYYMMDD, YYYYMMDDHHMM, either once only, or twice with separator ='-' or '_' - wildcards '?' and '*' for matching respectively one and any number of characters """ def store_wildcard_facet_values(f,facets_regexp, kwargs, wildcards, merge_periods_on=None, fperiod=None,periods=None,periods_dict=None): """" """ if fperiod is not None and periods is not None : clogger.debug('Adding period %s'%fperiod) periods.append(fperiod) # for kw in kwargs : it=re.finditer(facets_regexp,f) for oc in it : try : facet_value=oc.group(kw) except : continue if type(kwargs[kw]) is str and ("*" in kwargs[kw] or "?" 
in kwargs[kw] ): if facet_value is not None : if kw not in wildcards : wildcards[kw]=set() wildcards[kw].add(facet_value) clogger.debug("Discover %s=%s for file=%s"%(kw,facet_value,f)) else : clogger.debug("Logic issue for kw=%s and file=%s"%(kw,f)) # if fperiod is not None and periods is not None : if merge_periods_on is None : key=None elif kw == merge_periods_on : key=facet_value else : #print "Skipping for kw=%s,sort=%s"%(kw,merge_periods_on) continue if key not in periods_dict: periods_dict[key]=set() #print "adding period %s for key %s"%(fperiod,key) periods_dict[key].add(fperiod) else: pass #print "no Adding period for %s=%s for %s"%(kw,facet_value,f) #print "end of store, periods_dict=",periods_dict, "wild=",wildcards rep=[] # periods=None # a list of periods available periods_dict=dict() # period=kwargs['period'] ; if period == "*" : periods=[] # List of all periods elif type(period) is str : period=init_period(period) # variable=kwargs['variable'] altvar=kwargs.get('filenameVar',variable) # # dicts of date patterns, for globbing and for regexp # digit="[0-9]" date_glob_patt={ "${PERIOD}" : "*" } # an ordered list of dates keywords date_keywords=date_glob_patt.keys() ; date_keywords.sort(reverse=True) # annee="%s{4}"%digit mois="(01|02|03|04|05|06|07|08|09|10|11|12)" jour="([0-3][0-9])" heure="(00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23)" minutes="[0-5][0-9]" date="%s(%s(%s(%s(%s)?)?)?)?"%(annee,mois,jour,heure,minutes) rperiod="(?P<period>(?P<start>%s)([_-](?P<end>%s))?)"%(date,date) date_regexp_patt={ "${PERIOD}" : rperiod } # an ordered list of dates regexp keywords date_regexp_keywords=date_regexp_patt.keys() ; date_regexp_keywords.sort(reverse=True) # # for l in urls : # Instantiate keywords in pattern with attributes values remote_prefix="" ; if re.findall(".*:.*",l) : remote_prefix=':'.join(l.split(":")[0:-1])+':' basename=l.split(":")[-1] # This discard the remote_prefix if any basename=basename.replace("//","/") 
my_template=Template(basename) template=my_template.safe_substitute(**kwargs) #print "template after attributes replace : "+template # # Construct a pattern for globbing dates temp2=template for k in date_keywords : temp2=temp2.replace(k,date_glob_patt[k]) # Do globbing with plain varname if remote_prefix : lfiles=sorted(glob_remote_data(remote_prefix, temp2)) clogger.debug("Remote globbing %d files for varname on %s : "%\ (len(lfiles),remote_prefix+temp2)) else: # local data lfiles=sorted(glob.glob(temp2)) clogger.debug("Before regexp filtering : Globbing %d files for varname on %s : "%(len(lfiles),temp2)) # Must filter with regexp, because * with glob is too inclusive alt=[] for f in lfiles : for k in date_keywords : if re.search(date_regexp_patt[k],f) : alt.append(f) continue lfiles=alt clogger.debug("Globbing %d files for varname on %s : "%(len(lfiles),temp2)) # # If unsuccessful using varname, try with filenameVar if len(lfiles)==0 and "filenameVar" in kwargs and kwargs['filenameVar'] : # Change value of facet 'variable' kwargs['variable']=kwargs['filenameVar'] template=my_template.safe_substitute(**kwargs) temp2=template for k in date_keywords : temp2=temp2.replace(k,date_glob_patt[k]) # # Do globbing with fileVarname if remote_prefix : # lfiles=sorted(glob_remote_data(remote_prefix, temp2)) clogger.debug("Remote globbing %d files for filenamevar on %s: "%\ (len(lfiles),remote_prefix+temp2)) else: # local data lfiles=sorted(glob.glob(temp2)) # Must filter with regexp, because * with glob is too inclusive alt=[] for f in lfiles : for k in date_keywords : if re.search(date_regexp_patt[k],f) : alt.append(f) continue lfiles=alt clogger.debug("Globbing %d files for filenamevar on %s: "%(len(lfiles),temp2)) # # For discovering values for those facets which are a wildcard, # construct a regexp with a group name for all facets (but period) alt_basename=basename.replace("?",".").replace("*",".*") alt_kwargs=kwargs.copy() for kw in kwargs : if type(kwargs[kw]) is str : 
# This excludes period attribute, which has a type alt_kwargs[kw]=kwargs[kw].replace("?",".").replace("*",".*") alt_basename=alt_basename.replace(r"${%s}"%kw,r"(?P<%s>%s)"%(kw,alt_kwargs[kw]),1) facets_regexp=Template(alt_basename).safe_substitute(**alt_kwargs) for k in date_regexp_keywords : facets_regexp=facets_regexp.replace(k,date_regexp_patt[k],1) facets_regexp=facets_regexp.replace(k,".*") wildcards=dict() #print "facets_regexp=",facets_regexp # # Construct regexp for extracting dates from filename date_regexp=None template_toreg=template.replace("*",".*").replace("?",r".").replace("+","\+") #print "template before searching dates : "+template_toreg for key in date_regexp_keywords : #print "searchin "+key+" in "+template start=template_toreg.find(key) if (start>=0 ) : date_regexp=template_toreg.replace(key,date_regexp_patt[key],1) #print "found ",key," dateregexp ->",date_regexp hasEnd=False start=date_regexp.find(key) #start=date_regexp.find(key) if (start >=0 ) : hasEnd=True date_regexp=date_regexp.replace(key,date_regexp_patt[key],1) #date_regexp=date_regexp.replace(key,date_regexp_patt[key],1) break #print "date_regexp before searching dates : "+date_regexp # for f in lfiles : #print "processing file "+f # # Extract file time period # fperiod=None if date_regexp : if "P<period>" in date_regexp : #print "date_rexgep=",date_regexp #print "f=",f #print "period=",re.sub(date_regexp,r'\g<period>',f) tperiod=re.sub(date_regexp,r'\g<period>',f) if tperiod==f : raise classes.Climaf_Error("Cannot find a period in %s with regexp %s"%(f,date_regexp)) fperiod=init_period(tperiod) else: date_regexp0=date_regexp #print "date_regexp for extracting dates : "+date_regexp0, "file="+f start=re.sub(date_regexp0,r'\1',f) if start==f: raise Climaf_Data_Error("Start period not found in %s using regexp %s"%(f,regexp0)) #? 
if hasEnd : end=re.sub(date_regexp0,r'\2',f) fperiod=init_period("%s-%s"%(start,end)) else : fperiod=init_period(start) #print "period for file %s is %s"%(f,fperiod) # # Filter file time period against required period else : if ( 'frequency' in kwargs and ((kwargs['frequency']=="fx") or \ kwargs['frequency']=="seasonnal" or kwargs['frequency']=="annual_cycle" )) : # local data if not remote_prefix and \ ( (basename.find("${variable}")>=0) or variable=='*' or \ fileHasVar(f,variable) or (variable != altvar and fileHasVar(f,altvar)) ) : clogger.debug("adding fixed field :"+f) store_wildcard_facet_values(f,facets_regexp, kwargs, wildcards,merge_periods_on) rep.append(f) # remote data elif remote_prefix : if (basename.find("${variable}")>=0) or variable=='*' or \ (variable != altvar and (f.find(altvar)>=0) ): clogger.debug("adding fixed field :"+remote_prefix+f) store_wildcard_facet_values(f,facets_regexp, kwargs, wildcards,merge_periods_on) rep.append(remote_prefix+f) else: raise classes.Climaf_Error( "For remote files, filename pattern (%s) should include ${varname} "+\ "(which is instanciated by variable name or filenameVar)"%f) else : clogger.info("Cannot yet filter files re. 
time using only file content.") store_wildcard_facet_values(f,facets_regexp, kwargs, wildcards,merge_periods_on) rep.append(f) # # If file period matches requested period, check similarly for variable # #print "fperiod=",fperiod #print "periods=",periods #print "inter=",period.intersects(fperiod) #print "date_regexp=",date_regexp if (fperiod and ( periods is not None or period.intersects(fperiod) )) \ or not date_regexp : # clogger.debug('Period is OK - Considering variable filtering on %s and %s for %s'%(variable,altvar,f)) # Filter against variable if (l.find("${variable}")>=0): clogger.debug('appending %s based on variable in filename'%f) store_wildcard_facet_values(f,facets_regexp, kwargs, wildcards,merge_periods_on, fperiod,periods,periods_dict) rep.append(remote_prefix+f) continue if (f not in rep): # local data if not remote_prefix and \ (variable=='*' or "," in variable or fileHasVar(f,variable) or \ (altvar != variable and fileHasVar(f,altvar))) : # Should check time period in the file if not date_regexp clogger.debug('appending %s based on multi-var or var exists in file '%f) store_wildcard_facet_values(f,facets_regexp, kwargs, wildcards,merge_periods_on, fperiod,periods,periods_dict) rep.append(f) continue # remote data elif remote_prefix : if variable=='*' or "," in variable or \ (variable != altvar and (f.find(altvar)>=0) ): # Should check time period in the file if not date_regexp clogger.debug('appending %s based on multi-var or altvar '%(remote_prefix+f)) store_wildcard_facet_values(f,facets_regexp, kwargs, wildcards, merge_periods_on, fperiod,periods,periods_dict) rep.append(remote_prefix+f) continue else: mess="For remote files, filename pattern (%s) should include"%(remote_prefix+f) mess+=" ${varname} (which is instanciated by variable name or filenameVar)" raise classes.Climaf_Error(mess) else: if not fperiod : clogger.debug('not appending %s because period is None '%f) elif not period.intersects(fperiod) : clogger.debug('not appending %s 
because period doesn t intersect %s'%(f,period)) else: clogger.debug('not appending %s for some other reason %s'%(f)) # Break on first url with any matching data if len(rep)>0 : clogger.debug('url %s does match for '%l + `kwargs`) break # For wildcard facets, discover facet values + checks for facet in wildcards: s=wildcards[facet] if return_wildcards is not None : if facet=="period" : #print "s=",s," periods_dict=",periods_dict for val in periods_dict : periods_dict[val]=sort_periods_list(list(periods_dict[val])) clogger.info("Attribute period='*' has values %s"%(periods_dict)) return_wildcards["period"]=periods_dict else: if len(s) == 1 : s=s.pop() clogger.info("Attribute %s='%s' has matching value '%s'"%(facet,kwargs[facet],s)) return_wildcards[facet]=s else: rep=list(s); rep.sort() return_wildcards[facet]=rep message="Attribute %s='%s' has multiple values : %s"%(facet,kwargs[facet],list(s)) if return_wildcards : clogger.info(message) else: clogger.error(message) s=return_wildcards[facet] else: clogger.debug("return_wildcards is None") return rep
def __init__(self, project="*", organization='generic', url=None, model="*",
             simulation="*", realm="*", table="*", frequency="*"):
    """
    Create an entry in the data locations dictionary for an ensemble of datasets.

    Args:
        project (str,optional): project name
        model (str,optional): model name
        simulation (str,optional): simulation name
        frequency (str,optional): frequency
        organization (str): name of the organization type, among those handled by
            :py:func:`~climaf.dataloc.selectFiles`
        url (list of strings): list of URLS for the data root directories, local
            or remote

    Each entry in the dictionary allows to store :

    - a list of path or URLS (local or remote), which are root paths for
      finding some sets of datafiles which share a file organization scheme.

      For remote data: url is supposed to be in the format
      'protocol:user@host:path', but 'protocol' and 'user' are optional. So,
      url can also be 'user@host:path' or 'protocol:host:path' or 'host:path'.
      ftp is default protocol (and the only one which is yet managed, AMOF).

      If 'user' is given:

      - if 'host' is in $HOME/.netrc file, CliMAF checks if the corresponding
        login == 'user'. If it is, CliMAF gets the associated password;
        otherwise it will prompt the user for entering a password;
      - if 'host' is not present in $HOME/.netrc file, CliMAF will prompt the
        user for entering a password.

      If 'user' is not given:

      - if 'host' is in $HOME/.netrc file, CliMAF gets the corresponding
        'login' as 'user' and also gets the associated password;
      - if 'host' is not present in $HOME/.netrc file, CliMAF prompts the user
        for entering 'user' and 'password'.

      Remark: The .netrc file contains login and password used by the
      auto-login process. It generally resides in the user's home directory
      ($HOME/.netrc). So, it is highly recommended to supply this information
      in the .netrc file so as not to have to enter a password in every
      request.

      Warning: the python netrc module does not handle multiple entries for a
      single host: if the netrc file has two entries for the same host, the
      netrc module only returns the last entry.

      We define two kinds of host: hosts with evolving files, e.g. 'beaufix';
      and the others. For any file returned by
      :py:meth:`~climaf.classes.cdataset.listfiles` which is found in cache:

      - in case of hosts with dynamic files, the file is transferred only if
        its date on the server is more recent than that found in cache;
      - for other hosts, the file found in cache is used.

    - the name for the corresponding data files organization scheme. The
      current set of known schemes is :

      - CMIP5_DRS : any datafile organized after the CMIP5 data reference
        syntax, such as on IPSL's Ciclad and CNRM's Lustre
      - EM : CNRM-CM post-processed outputs as organized using EM (please use
        a list of any one string for arg urls)
      - generic : a data organization described by the user, using patterns
        such as described for
        :py:func:`~climaf.dataloc.selectGenericFiles`. This is the default.

      Please ask the CliMAF dev team for implementing further organizations.
      Organizations considered for future implementations are :

      - NetCDF model outputs as available during an ECLIS or libIGCM simulation
      - ESGF

    - the set of attribute values for which the simulation's data are stored
      at these URLS and with that organization.

    For remote files, the filename pattern must include ${varname}, which is
    instantiated by variable name or filenameVar (given via
    :py:func:`~climaf.classes.calias()`), for the sake of efficiency. Please
    complain if this is inadequate.

    For the sake of brevity, each attribute can have the '*' wildcard value;
    when using the dictionary, the most specific entries will be used (which
    means : the entry (or entries) with the lowest number of wildcards).

    Example :

    - Declaring that all IPSLCM-Z-HR data for project PRE_CMIP6 are stored
      under a single root path and follows the organization named CMIP6_DRS::

        >>> dataloc(project='PRE_CMIP6', model='IPSLCM-Z-HR', organization='CMIP6_DRS', url=['/prodigfs/esg/'])

    - and declaring an exception for one simulation (here, both location and
      organization are supposed to be different)::

        >>> dataloc(project='PRE_CMIP6', model='IPSLCM-Z-HR', simulation='my_exp', organization='EM', url=['~/tmp/my_exp_data'])

    - and declaring a project to access remote data (on multiple servers)::

        >>> cproject('MY_REMOTE_DATA', ('frequency', 'monthly'), separator='|')
        >>> dataloc(project='MY_REMOTE_DATA', organization='generic',
        ... url=['beaufix:/home/gmgec/mrgu/vignonl/*/${simulation}SFX${PERIOD}.nc',
        ...      'ftp:vignonl@hendrix:/home/vignonl/${model}/${variable}_1m_${PERIOD}_${model}.nc'])
        >>> calias('MY_REMOTE_DATA','tas','tas',filenameVar='2T')
        >>> tas=ds(project='MY_REMOTE_DATA', simulation='AMIPV6ALB2G', variable='tas', frequency='monthly', period='198101')

    Please refer to the :ref:`example section <examples>` of the documentation
    for an example with each organization scheme
    """
    self.project = project
    self.model = model
    self.simulation = simulation
    self.frequency = frequency
    # Fixed : 'realm' and 'table' were accepted as arguments but silently
    # dropped, unlike every other facet (assumed intended to be stored --
    # matches handling of the sibling facets)
    self.realm = realm
    self.table = table
    self.organization = organization
    if organization not in ['EM', 'CMIP5_DRS', 'generic']:
        raise classes.Climaf_Error("Cannot process organization " + organization)
    if isinstance(url, list):
        self.urls = url
    else:
        if re.findall("^esgf://.*", url):
            self.organization = "ESGF"
        self.urls = [url]
    self.urls = map(os.path.expanduser, self.urls)
    alt = []
    for u in self.urls:
        # Leave env-var paths ('$...') and remote urls ('host:path') untouched
        if u[0] != '$' and ':' not in u:
            alt.append(os.path.abspath(u))
        else:
            alt.append(u)
    # Change all datedeb-datend patterns to ${PERIOD} for upward compatibility
    alt2 = []
    for u in alt:
        for pat in ["YYYYMMDDHHMM", "YYYYMMDDHH", "YYYYMMDD", "YYYYMM", "YYYY"]:
            u = u.replace(pat + "-" + pat, "${PERIOD}")
            u = u.replace(pat + "_" + pat, "${PERIOD}")
            u = u.replace(pat, "${PERIOD}")
        alt2.append(u)
    #
    self.urls = alt2
    # Register new dataloc only if not already registered
    if not (any([l == self for l in locs])):
        locs.append(self)
def selectFiles(return_wildcards=None, merge_periods_on=None, **kwargs): """ Returns the shortest list of (local or remote) files which include the data for the list of (facet,value) pairs provided Method : - use datalocations indexed by :py:func:`~climaf.dataloc.dataloc` to identify data organization and data store urls for these (facet,value) pairs - check that data organization is as known one, i.e. is one of 'generic', CMIP5_DRS' or 'EM' - derive relevant filenames search function such as as : py:func:`~climaf.dataloc.selectCmip5DrsFiles` from data organization scheme - pass urls and relevant facet values to this filenames search function """ rep=[] project=kwargs['project'] simulation=kwargs['simulation'] if 'model' in kwargs : model=kwargs['model'] else : model="*" if 'frequency' in kwargs : frequency=kwargs['frequency'] else : frequency="*" ofu=getlocs(project=project, model=model, simulation=simulation, frequency=frequency) clogger.debug("locs="+ `ofu`) if ( len(ofu) == 0 ) : clogger.warning("no datalocation found for %s %s %s %s "%(project, model, simulation, frequency)) for org,freq,urls in ofu : if return_wildcards is not None and org is not "generic" : raise classes.Climaf_Error("Can hanle multipe facet query only for organization=generic ") kwargs2=kwargs.copy() # Convert normalized frequency to project-specific frequency if applicable if "frequency" in kwargs and project in classes.frequencies : normfreq=kwargs2['frequency'] if normfreq in classes.frequencies[project]: kwargs2['frequency']=classes.frequencies[project][normfreq] # JS # Convert normalized realm to project-specific realm if applicable if "realm" in kwargs and project in classes.realms : normrealm=kwargs2['realm'] if normrealm in classes.realms[project]: kwargs2['realm']=classes.realms[project][normrealm] # # Call organization-specific routine if (org == "EM") : rep.extend(selectEmFiles(**kwargs2)) elif (org == "CMIP5_DRS") : rep.extend(selectCmip5DrsFiles(urls,**kwargs2)) elif (org == 
"generic") : rep.extend(selectGenericFiles(urls, return_wildcards=return_wildcards, \ merge_periods_on=merge_periods_on,**kwargs2)) else : raise classes.Climaf_Error("Cannot process organization "+org+ \ " for simulation "+simulation+" and model "+model+\ " of project "+project) if (not ofu) : return None else : if (len(rep) == 0 ) : clogger.warning("no file found for %s, at these " "data locations %s "%(`kwargs` , `urls`)) if any([ kwargs[k] == '' for k in kwargs ]) : clogger.warning("Please check these empty attributes %s"%\ [ k for k in kwargs if kwargs[k]=='' ]) return None # Discard duplicates (assumes that sorting is harmless for later processing) rep.sort() last=None for f in rep : if f == last : rep.remove(last) last=f # Assemble filenames in one single string return(string.join(rep))