def call_sessionmaker(root):
    from sqlalchemy.orm import sessionmaker
    from sqlalchemy import create_engine
    from esgcet.config import loadConfig, initLogging, registerHandlers

    # init_file = "../scripts/esg.ini"
    init_file = None        # Load installed init file
    echoSql = True

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    root.echoSql = echoSql  # set the flag on root so it is defined before it is used below
    root.engine = create_engine(config.getdburl('extract'), echo=root.echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=root.engine)
    Session = sessionmaker(bind=root.engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    root.config = config
    root.Session = Session
    root.projectName = None
    root.firstFile = None
    root.dmap = None
    root.extraFields = None
    root.directoryMap = None
    root.datasetMapfile = None
    root.filefilt = None
def initdb(init_file=None, echoSql=False, log_filename=None):
    global dbengine

    config = loadConfig(init_file)
    if dbengine is None:
        dbengine = create_engine(config.getdburl("extract"), echo=echoSql, pool_recycle=3600)
    initLogging("extract", override_sa=dbengine, log_filename=log_filename)
    Session = sessionmaker(bind=dbengine, autoflush=True, autocommit=False)
    return config, Session
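# A minimal usage sketch for initdb() above (not from the original source).  It assumes the
# enclosing module defines `dbengine = None` at module level and imports loadConfig,
# initLogging, create_engine and sessionmaker, as initdb() requires; the log file name is
# a placeholder.
def _example_initdb_usage():
    config, Session = initdb(init_file=None, echoSql=False, log_filename='extract.log')
    session = Session()
    try:
        return config.getdburl('extract')   # e.g. inspect the database URL the session uses
    finally:
        session.close()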
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "", ['config-section=', 'echo', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "srmls"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag == '--config-section':
            configSection = arg
        elif flag == '--echo':
            echo = True
        elif flag == '--recursive':
            recurse = (arg.lower() == "yes")

    if len(lastargs) == 0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "-recursive"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'srmls')
    offline_proxy = config.get(configSection, 'srm_server')
    archive = config.get(configSection, 'srm_archive')
    srm_prefix = "%s?SFN=%s" % (offline_proxy, archive)
    echo_args = "-storageinfo %s -s '%s?SFN=%s%s'" % (recurseOption, offline_proxy, archive, lastargs[0])
    command_args = "-storageinfo %s -s %s?SFN=%s%s" % (recurseOption, offline_proxy, archive, lastargs[0])

    if echo:
        print '%s %s' % (command, echo_args)
        sys.exit(0)

    try:
        f = subprocess.Popen([command, command_args], stdout=subprocess.PIPE).stdout
    except:
        raise ESGPublishError("Error running command '%s %s': check configuration option 'srmls'" % (command, command_args))

    for path, size in SRMIterator(f, srm_prefix):
        print path, size

    f.close()
def initdb(init_file=None, echoSql=False, log_filename=None):
    global dbengine

    config = loadConfig(init_file)
    if dbengine is None:
        dbengine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('DEFAULT', override_sa=dbengine, log_filename=log_filename)
    Session = sessionmaker(bind=dbengine, autoflush=True, autocommit=False)
    return config, Session
def gc_mvgood(topdir, gcdir):
    """second step of gc, move good files from /scratch/_gc/ to /scratch/."""
    config = loadConfig(None)
    engine = sqlalchemy.create_engine(config.get('replication', 'esgcet_db'), echo=False, pool_recycle=3600)

    # os.walk isn't going to work very well.  I would have to parse the path to identify
    # the abs_path, which encodes the facets and version of the dataset, etc.
    # It's easier to start with those pieces of the path, and stick them together...
    for gcdsdir in glob.glob(gcdir):
        fac1dir = gcdsdir[len(os.path.join(topdir, 'scratch/_gc/')):]   # one choice of facets
        # ...gcdsdir is the root directory for the dataset now in .../scratch/_gc/...
        # Below this directory are ones for versions and variables, and possibly bad? directories
        # for files which failed a checksum.
        if fac1dir.endswith("withdrawn"):
            # leave this facet directory in _gc, all versions
            # rare, seen for LASG; probably it's a name change done by hand
            continue
        versiondirs = os.listdir(gcdsdir)   # should be version directories, e.g. v20120913/
        for versd in versiondirs:
            verspath = os.path.join(gcdsdir, versd)
            if not os.path.isdir(verspath):
                continue
            if not check_versiondir(versd):
                raise Exception("%s does not look like a version directory" % versd)
            vardirs = os.listdir(verspath)
            mvstatus = False   # True if any file in this dataset+version should be moved
                               # back to scratch/.
            for vard in vardirs:
                varpath = os.path.join(verspath, vard)
                dirpath = varpath
                if not os.path.isdir(varpath):
                    continue
                filenames = os.listdir(varpath)   # mostly files, may also have bad? directories
                for filename in filenames:
                    filep = os.path.join(varpath, filename)
                    if os.path.isfile(filep):
                        abspath = os.path.join(fac1dir, versd, vard, filename)
                        mvstatus = mvstatus or mvgood2scratch(filename, abspath, dirpath, engine)
            if mvstatus is True:
                # A file was moved back to scratch; others in the same dataset+version should be
                # moved.
                for vard in vardirs:
                    varpath = os.path.join(verspath, vard)
                    dirpath = varpath
                    if not os.path.isdir(varpath):
                        continue
                    filenames = os.listdir(varpath)   # mostly files, may also have bad? directories
                    for filename in filenames:
                        filep = os.path.join(varpath, filename)
                        if os.path.isfile(filep):
                            mv2scratch(filename, dirpath)
                        if os.path.isdir(filep) and filep.find('bad') == 0:
                            for filen in os.listdir(filep):
                                if os.path.isfile(filen):
                                    mv2scratch(filename, dirpath)
def download_list(facets=facets_default, downloadlist="download-mon", tempfile="esg.tmp",
                  reuse_tempfile=False, forcedl=False,
                  statusfile=None, head='/cmip5/scratch', serviceurl="", limit=0):
    """generates a download list suitable for Estani's Download.py.  Inputs are:
    facets, a dictionary of 9 facets which specify the datasets to be downloaded
       (you can generate facets with the function dataset2facets)
    downloadlist, name of the file to write to
    tempfile, name of a temporary file (used for output from esgquery_index)
       (for debugging purposes you can set reuse_tempfile=True to use a file previously generated)
    forcedl, set True to force download even of files we already have
    statusfile, name of a file to which warnings and debugging information are written;
       defaults to stdout
    head, the first part of the download path (after which is the abs_path of the replica database)
    limit, the maximum number of files to process; 0 for no limit (mainly for testing)
    serviceurl, the argument of esgquery_index
    """
    if statusfile is None:
        statusfile = sys.stdout
    else:
        statusfile = open(statusfile, 'a+')

    # Put the institute name in its P2P form for esgquery_index.
    # For example, esgquery_index wants CCCMA, not CCCma; and sometimes a space rather than a hyphen.
    institute = facets['institute']
    if institute in institute_p.keys():
        institute = institute_p[institute]
    facets['institute'] = institute
    # Also fix the project name - it's CMIP5 for the P2P system, cmip5 for the gateway system.
    facets['project'] = facets['project'].upper()   # same for cmip5/CMIP5
    # Also, in the one case I know of where this is a problem, put the model in its P2P form:
    if facets['model'] == 'inmcm4':
        facets['model'] = 'INM-CM4'

    # The fields to get with esgquery:
    fields = facets.keys() + ['url', 'latest', 'variable', 'title', 'size', 'checksum', 'checksum_type']
    # ...note about versions: the version field (when you do esgquery_index on a file) is file_version,
    # not the dataset version which is normally meant by "version".  We can get the dataset version by
    # extracting the dataset_id field for the file, then calling
    # "esgquery_index -q id=<dataset_id> --fields version -p".
    # But it will be simpler to extract the dataset version from a file_id, as is done below.

    pp = pprint.PrettyPrinter(stream=statusfile)

    if not reuse_tempfile:
        ffiles = open(tempfile, 'w')
        arg1 = "-q " + ','.join([i + "='" + facets[i] + "'" for i in facets.keys() if facets[i] != '%'])
        arg2 = "--fields " + ','.join(fields) + " --type f -p"
        arg3 = ""   # various options
        if len(serviceurl) > 0:
            arg3 = arg3 + " --serviceurl " + serviceurl
        if limit > 0:
            arg3 = arg3 + " --limit %d" % (limit)
        subprocess.call(['echo', 'esgquery_index', arg1, arg2, arg3], stdout=ffiles)
        print 'esgquery_index' + ' ' + arg1 + ' ' + arg2 + ' ' + arg3
        subprocess.call(['esgquery_index' + ' ' + arg1 + ' ' + arg2 + ' ' + arg3], stdout=ffiles, shell=True)
        ffiles.close()
    ffiles = open(tempfile)

    dll = []   # each list entry will be a dictionary, download information for one file
    # Thus I'm assuming that esgquery_index output is sorted by the file id.  If this ever turns out not
    # to be true, I can sort it or make dll a dict.
    fileid = None
    for line in ffiles:
        cols = line.split('|')
        # len(cols)=1: header or footer.  Otherwise, usually:
        # cols[0]='', cols[1]=dataset.file, cols[2]=host, cols[3]=field,
        # cols[4:len-1]=value, cols[len-1]=garbage.
        # The original id (first column) is cols[1]+'.'+cols[2].
        if len(cols) < 6:
            continue   # probably 6 columns is real; anything less means header or footer
        if cols[1] != fileid:
            # first hit on this file
            fileid = cols[1]
            fd = {'fileid': lstrip(rstrip(fileid))}
            dll.append(fd)
        field = lstrip(rstrip(cols[3]))
        fd[field] = [lstrip(rstrip(val)) for val in cols[4:len(cols) - 1]]
    # pp.pprint(dll)

    fdllist = open(downloadlist, 'w')
    dllist = []
    for fd in dll:
        # Form this file's line in the download list, first separately getting its fields (columns).
        # Don't bother if this isn't the latest version of the file:
        if fd['latest'][0] != 'true':
            # print "file", fd['fileid'], "is not the latest"
            continue
        out0 = fd['url'][0]
        out2 = fd['size'][0]
        out3 = fd.get('checksum', ['DUMMY'])[0]
        out4 = fd.get('checksum_type', ['md5'])[0].lower()
        # When we write to a file, out1 will be the target path, of the form head/project/product/institute/...
        # But for now, out1 will be abs_path, of the form project/product/institute/...
        # ...institute/model/experiment/time_frequency/realm/cmor_table/ensemble/<version>/variable/title
        # To get the version, take advantage of the fact that the fileid always begins with the same fields,
        # project.product.institute.model.experiment.time_frequency.realm.cmor_table.ensemble.version.filename:
        version = fd['fileid'].split('.')[9]
        institute = fd['institute'][0]
        # Put the institute name in its gateway form.
        if institute in institute_g.keys():
            institute = institute_g[institute]
        model = fd['model'][0]
        if model == "INM-CM4" or model == "inm-cm4":
            model = "inmcm4"
        out1 = '/'.join([fd['project'][0].lower(), fd['product'][0], institute, fd['model'][0],
                         fd['experiment'][0], fd['time_frequency'][0], fd['realm'][0], fd['cmor_table'][0],
                         fd['ensemble'][0], version, fd['variable'][0], fd['title'][0]])
        dllist.append([out0, out1, out2, out3, out4])
        # fdllist.write('\t'.join([out0, head+'/'+out1, out2, out3, out4])+'\n')

    # At this point we have in dllist a download list, constructed from the output of esgquery_index
    # (i.e. the P2P system).
    # Now we shall compare it with files known to the replication database.  If we already have the
    # file, don't get it again.  If we have the exact same file stored under a different version number
    # (very common), re-use it.  If the file _isn't_ in the database, issue a warning and don't get it
    # (because we wouldn't be able to keep track of it after downloading it).
    config = loadConfig(None)
    engine = sqlalchemy.create_engine(config.get('replication', 'esgcet_db'), echo=False, pool_recycle=3600)

    # files0 is the files we want but already have, expressed as an abs_path; from the replication database.
    # Note: this could be merged with the check for older files - would save a database access
    # but increase code complexity a little.
    # Note: unfortunately, esgquery_index and postgres seem to do output sorting and hence limits
    # a little differently.
    dstr = facets2dataset(facets)
    if not forcedl:
        sql0 = "SELECT abs_path FROM replica.files WHERE dataset_name LIKE '" + dstr + "' AND status>=30;"
        files0 = engine.execute(sql.text(sql0)).fetchall()
        files0 = [f[0] for f in files0]
        # pp.pprint("From %s files0=\n" % (sql0))
        # pp.pprint(files0)

        # Of course, we don't want files we already have, so take them out of the download list.
        # Making a set out of files0 should convert the deletion process from O(n^2) to O(n), with
        # bigger constants; I'm not sure whether it's worth it.  Usually 1,000<n<100,000.
        # Also, the database and esgquery_index have different naming conventions, e.g. institute
        # CCCma/CCCMA, so the comparison has to be made case-insensitive (at least; but I believe that
        # urls and file paths derived from the P2P system will hyphenate the same way).
        sfiles0 = set([f.lower() for f in files0])
        ldllist0 = len(dllist)
        dllist = [row for row in dllist if row[1].lower().replace('inm-cm4', 'inmcm4') not in sfiles0]
        # ...if there are any more mismatch cases not handled by lower(), then I'll have to do
        # a more complicated fix - break up into facets, substitute with tables, then recombine.
        statusfile.write("dllist reduced from %d to %d lines by removing files we already have\n" %
                         (ldllist0, len(dllist)))

    # files1 is the files we want and don't have; from the replication database.
    # It should correspond to the download list, but probably doesn't because they're based on different
    # harvests.
    # I don't want to deal with files which are missing from the database.  Rather than try
    # to fix the database, we'll take them out of the download list too.
    if forcedl:
        sql1 = "SELECT abs_path FROM replica.files WHERE dataset_name LIKE '" + dstr + "';"
    else:
        sql1 = "SELECT abs_path FROM replica.files WHERE dataset_name LIKE '" + dstr + "' AND status<30;"
    print sql1
    files1 = engine.execute(sql.text(sql1)).fetchall()
    files1 = [f[0] for f in files1]
    # pp.pprint("From %s files1=\n" % (sql1))
    # pp.pprint(files1)
    sfiles1 = set([f.lower() for f in files1])
    ldllist0 = len(dllist)
    # pp.pprint(dllist)
    print [row[1].lower().replace('inm-cm4', 'inmcm4') for row in dllist][0]
    dllist2 = [row for row in dllist if row[1].lower().replace('inm-cm4', 'inmcm4') in sfiles1]
    # ...if there are any more mismatch cases not handled by lower(), then I'll have to do
    # a more complicated fix - break up into facets, substitute with tables, then recombine.
    statusfile.write(("dllist reduced from %d to %d lines by removing files not known to the replication" +
                      " database.\n") % (ldllist0, len(dllist2)))
    if len(dllist2) < ldllist0:
        statusfile.write("WARNING: This change discards the following download list files.\n")
        statusfile.write("Maybe it's time for another harvest!\n")
        if statusfile != sys.stdout:   # don't write too much to the screen
            pp.pprint([row[1] for row in dllist if row[1].lower().replace('inm-cm4', 'inmcm4') not in sfiles1])
            # ...if there are any more mismatch cases not handled by lower(), then I'll have to do
            # a more complicated fix - break up into facets, substitute with tables, then recombine.
        else:
            statusfile.write("(filenames not printed)\n")
    dllist = dllist2

    # If there are no output limits, and the relevant harvests are up-to-date, then there should be a
    # 1:1 correspondence between files1 and dllist (because no file should appear twice in either one).
    # So check for that.
    if limit <= 0:
        if len(files1) != len(dllist):
            statusfile.write("WARNING: esgquery and database produced different numbers of files to download!")
            statusfile.write(" esgquery: %d; database: %d\n" % (len(dllist), len(files1)))
            if statusfile != sys.stdout:
                print "WARNING: esgquery and database produced different numbers of files to download!",\
                    len(dllist), len(files1)

    # _Maybe_ these sorts will help in finding the row efficiently in the older-version search below;
    # this needs investigation if it matters.
    files1.sort(key=(lambda i: i.lower()))
    dllist.sort(key=(lambda i: i[1].lower()))

    # files2 is the same as files1 but with the SQL wildcard % in place of the version.
    # Example of the following:
    # f = cmip5/output1/CCCma/CanCM4/decadal2008/mon/atmos/Amon/r1i1p1/v20111027/cl/cl_etc.nc
    # fs=[cmip5,output1,CCCma,CanCM4,decadal2008,mon,atmos,Amon,r1i1p1,v20111027,cl,cl_etc.nc]
    # fs=[cmip5,output1,CCCma,CanCM4,decadal2008,mon,atmos,Amon,r1i1p1,%,cl,cl_etc.nc]
    # g = cmip5/output1/CCCma/CanCM4/decadal2008/mon/atmos/Amon/r1i1p1/%/cl/cl_etc.nc
    files2 = files1[:]
    for i in range(len(files2)):
        fs = files1[i].split('/')
        fs[9] = '%'
        files2[i] = '/'.join(fs)
    # pp.pprint("From %s files2=\n" % (sql1))
    # pp.pprint(files2)

    # Now look for older versions of each file in dllist/files1/files2:
    nnomatch = 0
    for i in range(len(files2)):
        fil2 = files2[i]
        fil1 = files1[i]
        sql2 = "SELECT abs_path,checksum,checksum_type FROM replica.files WHERE abs_path LIKE '" + fil2 +\
               "' AND status>=30;"
        hvf = engine.execute(sql.text(sql2)).fetchall()   # list of (abs_path,checksum,checksum_type)
        for fi in hvf:
            # If abs_path is in the download list, this is a file we have which is the same as a file we
            # want, other than version number.  If the checksum matches, we don't have to download -
            # just copy from one version's directory to the new version's directory.
            # Of course, don't bother to do anything if the dllist already refers to another local copy.
            row = next((r for r in dllist if r[1] == fil1), None)   # the row which matches fil1; None if no match
            # ...the above use of a generator expression will only search until the row is found.
            # Thanks to ionous blog: http://dev.ionous.net/2009/01/python-find-item-in-list.html
            # >>> I would like it to start at the previous match; that will usually get the next
            # >>> match in just one try if the lists are sorted first; look into this later.
            if row is None:
                # The database has a file, abs_path==fil1, which the P2P system (i.e. dllist) doesn't
                # know about.  That is, the P2P and gateway systems are inconsistent with one another.
                # This shouldn't happen, but often does...
                if statusfile != sys.stdout:   # don't write too much to the screen!
                    statusfile.write("WARNING, can't find match for database file %s\n" % (fil1))
                nnomatch = nnomatch + 1
                continue
            statusfile.write(fil1 + '\n')
            pp.pprint(row)
            statusfile.write(fi[0] + '\n')
            if fi[1].upper() == row[3].upper() and fi[1].upper() != 'DUMMY' and\
               fi[2].lower() == row[4].lower() and row[0].find("file") != 0:
                # checksums match, aren't "DUMMY", so change the download url to do a local copy
                # >>>> for the moment, assume that we know where the file is. <<<<
                # >>>> later, check that it's here, and if not look other places...
                row[0] = "file://" + head + '/' + fi[0]

    statusfile.write("%d failures to find a P2P file matching a file needed by the database\n" % (nnomatch))
    if statusfile != sys.stdout:
        print "%d failures to find a P2P file matching a file needed by the database\n" % (nnomatch)

    for row in dllist:
        fdllist.write('\t'.join([row[0], head + '/' + row[1], row[2], row[3], row[4]]) + '\n')

    ffiles.close()
    fdllist.close()
    if statusfile is not sys.stdout:
        statusfile.close()
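# A hypothetical driver for download_list() above (not from the original source).  The nine
# facet values mirror the example path in the comments (cmip5/output1/CCCma/CanCM4/...);
# the output, temp and status file names, head and limit are illustrative placeholders.
def _example_download_list_usage():
    facets = {'project': 'cmip5', 'product': 'output1', 'institute': 'CCCma',
              'model': 'CanCM4', 'experiment': 'decadal2008', 'time_frequency': 'mon',
              'realm': 'atmos', 'cmor_table': 'Amon', 'ensemble': 'r1i1p1'}
    download_list(facets=facets,
                  downloadlist='download-mon',
                  tempfile='esg.tmp',
                  statusfile='download-mon.log',
                  head='/cmip5/scratch',
                  limit=10)   # small limit while testing, per the docstring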
def preoutput(argv):
    # This is what was formerly all of main() but the output section.
    global DEFAULT_QUERY_SERVICE

    try:
        args, lastargs = getopt.getopt(argv, "d:ho:pq:t:v",
                                       ["count", "delimiter=", "facet-query=", "facets=", "fields=",
                                        "format=", "free-text=", "help", "limit=", "pretty-print",
                                        "service-url=", "type=", "verbose"])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    # Get the search URL from the publisher configuration if possible
    try:
        from esgcet.config import loadConfig
        config = loadConfig(None)
        DEFAULT_QUERY_SERVICE = config.get("DEFAULT", "solr_search_service_url", default=DEFAULT_QUERY_SERVICE)
    except:
        pass

    allFacets = False   # facets=*
    countOnly = False
    delim = None
    facets = []
    facetValues = None
    fields = []
    format = DEFAULT_FORMAT
    freetext = None
    includeId = False
    objtype = DATASET
    offset = 0
    outpath = sys.stdout
    outpathIsStdout = True
    prettyPrint = False
    service = DEFAULT_QUERY_SERVICE
    userLimit = MAX_RECORDS
    verbose = False

    for flag, arg in args:
        if flag == "--count":
            countOnly = True
        elif flag in ["-d", "--delimiter"]:
            delim = arg
            prettyPrint = False
        elif flag == "--facets":
            facetList = arg.split(",")
            facetValues = [item.strip() for item in facetList]
            allFacets = (facetValues[0] == "*")
        elif flag == "--fields":
            fieldList = arg.split(",")
            fields = [item.strip() for item in fieldList]
        elif flag == "--format":
            if arg not in ["narrow", "wide"]:
                raise RuntimeError("Invalid format: %s" % arg)
            format = arg
        elif flag in ["-h", "--help"]:
            print usage
            sys.exit(0)
        elif flag == "--limit":
            try:
                userLimit = int(arg)
            except:
                raise RuntimeError("Invalid limit: %s" % arg)
        elif flag == "-o":
            outpath = open(arg, "w")
            outpathIsStdout = False
        elif flag in ["-p", "--pretty-print"]:
            prettyPrint = True
        elif flag in ["-q", "--facet-query"]:
            queries = arg.split(",")
            for q in queries:
                f, v = q.split("=")
                facets.append((f.strip(), v.strip()))
        elif flag == "--service-url":
            service = arg
        elif flag in ["-t", "--free-text"]:
            freetext = arg
        elif flag == "--type":
            try:
                objtype = typeCode[arg]
            except:
                raise RuntimeError("Invalid return type: %s" % arg)
        elif flag in ["-v", "--verbose"]:
            verbose = True

    # If returning id only, use wide format
    if fields == ["id"]:
        format = "wide"
        includeId = True

    # For facet value queries, use wide format.
    if facetValues is not None:
        format = "wide"

    # While remaining data:
    fullResults = []
    numFound = 0
    moredata = True
    nread = 0
    nleft = userLimit
    offset = 0
    chunksize = DEFAULT_CHUNKSIZE
    while moredata:

        # Formulate a query
        if not (countOnly or facetValues is not None):
            limit = min(nleft, chunksize)
        else:
            limit = 0
        query = formulateQuery(facets, fields, format, freetext, objtype, service, offset, limit,
                               facetValues=facetValues)
        if verbose:
            print >> sys.stderr, "Query: ", query

        # Read a chunk
        chunk = readChunk(service, query)

        # Parse the response.  For facet value searches, parse the response trailer
        if facetValues is None:
            results, numFound, numResults = parseResponse(chunk, includeId)
            fullResults.extend(results)
        else:
            numResults = 0
            if allFacets:
                fullResults, numFound = parseHeader(chunk)
            else:
                fullResults, numFound = parseTrailer(chunk, facetValues, includeId)

        # More data if some results were found and the number of records read < total
        nread += numResults
        nleft -= limit
        moredata = (numResults > 0) and (nread < min(numFound, userLimit))
        offset += limit

    ## print "jfp fullResults as list of (id,field,value) ="
    ## pprint.pprint(fullResults)
    ## results_ids = set([a for a,b,c in fullResults])
    ## fullResults_dicts = [(a, { b:c for a,b,c in fullResults if a1==a }) for a1 in results_ids]
    ## print "jfp fullResults as list of (id,dict_of_field:value) ="
    ## pprint.pprint( fullResults_dicts )

    # TO DO: return values which aren't used here, really should be computed in main
    return (fullResults, countOnly, facetValues, allFacets, prettyPrint, delim, outpath, numFound,
            outpathIsStdout, format)
def main():
    """Uses the esg.ini file options:
        - thredds_file_services to get a Globus endpoint UUID
        - thredds_root to find a directory with THREDDS xml catalogs
    """
    loadConfig(None)
    config = getConfig()
    if config is None:
        raise ESGPublishError('No configuration file found')

    # By default thredds_root is: /esg/content/thredds/esgcet
    thredds_root = config.get('DEFAULT', 'thredds_root')
    thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT', 'thredds_file_services')

    # parameters needed to re-harvest the THREDDS catalogs
    thredds_url = config.get('DEFAULT', 'thredds_url')
    hessian_service_certfile = config.get('DEFAULT', 'hessian_service_certfile')
    hessian_service_url = config.get('DEFAULT', 'hessian_service_url')
    esgf_harvesting_service_url = hessian_service_url.replace(
        'remote/secure/client-cert/hessian/publishingService', 'ws/harvest')

    thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..'))

    globus_base = None
    for service in thredds_file_services:
        if service[2] == 'Globus':
            globus_base = service[1]

    if globus_base is None:
        print 'No Globus file service specified in %s\n'\
              'Add Globus file service to the thredds_file_services variable in the form:\n'\
              '    Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\
              'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI']
        sys.exit(1)

    print '\n'\
          'ESGINI: %s\n'\
          'THREDDS root: %s\n'\
          'THREDDS url: %s\n'\
          'Globus service base: %s\n'\
          'ESGF harvesting service url: %s\n'\
          'X.509 user credential: %s\n'\
          '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base,
                esgf_harvesting_service_url, hessian_service_certfile)

    if not globus_base.endswith('/'):
        print 'Globus service base must end with "/". Set Globus service base correctly in\n'\
              '%s and run the script again.' % os.environ['ESGINI']
        sys.exit(1)

    print 'The script recursively goes through xml files in %s\n'\
          'looking for datasets that were published without Globus file service and adds\n'\
          'Globus access to the datasets. If a dataset was published with Globus file\n'\
          'service configured, the script skips such a dataset, leaving a corresponding xml\n'\
          'file unmodified. The script reinitializes THREDDS and requests Hessian service\n'\
          'to harvest the updated xml files. Because Hessian service requires SSL\n'\
          'authentication, the X.509 certificate, %s,\n'\
          'should be valid and obtained by a user who has the publisher role in all\n'\
          'projects.\n'\
          'It is strongly advised that you make a copy of the entire %s\n'\
          'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up)

    while True:
        sys.stdout.write("Do you want to continue? [y/N]")
        line = sys.stdin.readline().rstrip()
        if line == '' or line == 'n' or line == 'N':
            sys.exit(0)
        if line == 'y' or line == 'Y':
            break

    process(thredds_root, thredds_root_up, globus_base, thredds_url,
            esgf_harvesting_service_url, hessian_service_certfile)
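# For reference, a sketch of the esg.ini [DEFAULT] entries the script above reads, with the
# Globus line in the form its error message describes.  The endpoint UUID, host names and
# paths are placeholders, not values taken from the source.
#
#   thredds_root = /esg/content/thredds/esgcet
#   thredds_url = https://my-data-node.example.org/thredds/catalog
#   hessian_service_certfile = %(home)s/.globus/certificate-file
#   hessian_service_url = https://my-index-node.example.org/esg-search/remote/secure/client-cert/hessian/publishingService
#   thredds_file_services =
#       HTTPServer | /thredds/fileServer/ | HTTPServer | fileservice
#       Globus | globus:00000000-0000-0000-0000-000000000000/ | Globus | fileservice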
#!/usr/local/cdat/bin/python
"""Manages access to the replica DB"""

import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Float, String, sql, ForeignKey, orm
import metaconfig
import os
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# jfp copied from replica_manager.py:
from esgcet.config import loadConfig, initLogging
config = loadConfig(None)

# Final destination of files (the archive).  Typically this comes from ~/.esgcet/esg.ini
archive_root0 = config.get('replication', 'archive_root0')   # on gdo2: /cmip5/data
archive_root1 = config.get('replication', 'archive_root1')   # on gdo2: /css01-cmip5/data
archive_root2 = config.get('replication', 'archive_root2')   # on gdo2: /css02-cmip5/data
# temporary destinations of files and other data while completing the datasets
replica_root0 = config.get('replication', 'replica_root0')   # on gdo2: /cmip5/scratch
replica_root1 = config.get('replication', 'replica_root1')   # on gdo2: /css01-cmip5/scratch
replica_root2 = config.get('replication', 'replica_root2')   # on gdo2: /css02-cmip5/scratch
# not used: map_dir = os.path.join(replica_root, 'map')
# jfp was files_dir = os.path.join(replica_root, 'files')
files_dir0 = replica_root0   # on gdo2: /cmip5/scratch
files_dir1 = replica_root1   # on gdo2: /css01-cmip5/scratch
files_dir2 = replica_root2   # on gdo2: /css02-cmip5/scratch


def __get_config(section, key):
    try:
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "", ['config-section=', 'echo', 'help', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "msls"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag == '--config-section':
            configSection = arg
        elif flag == '--echo':
            echo = True
        elif flag == '--help':
            print usage
            sys.exit(0)
        elif flag == '--recursive':
            recurse = (arg.lower() == "yes")

    if len(lastargs) == 0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "R"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'msls')
    path = lastargs[0]
    command_args = "-l%s" % recurseOption

    if echo:
        print '%s %s %s' % (command, command_args, path)
        sys.exit(0)

    try:
        errout = subprocess.Popen([command, command_args, path],
                                  stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout
    except:
        raise ESGPublishError("Error running command '%s %s': check configuration option 'msls'" % (command, command_args))
    lines = errout.readlines()
    errout.close()

    directory = path
    for line in lines:
        line = line.strip()

        # Skip blank lines
        if len(line) == 0:
            continue

        # File
        elif line[0] == '-':
            fields = line.split()
            fullpath = os.path.join(directory, fields[-1])
            print fullpath, fields[4]

        # Directory
        elif line[0] == '/' and line[-1] == ':':
            directory = line[:-1]

        # Error
        elif line[0] == '/':
            raise ESGPublishError("Error: %s" % line)

        # Skip
        else:
            continue
def getData(inpath, extra_metadata):
    # pdb.set_trace()
    f = cdms2.open(inpath)

    # get a handle to the main module so we can call its routines
    main = sys.modules['__main__']

    # load info from esg.ini
    cfg = loadConfig("esg.ini")

    # load info from this run's transient config
    x = SaneConfigParser({})
    x.read(extra_metadata)

    # pdb.set_trace()
    output = dif_switch()

    for mode in ['thredds_aggregation_services', 'thredds_file_services', 'thredds_offline_services']:
        s = splitRecord(cfg.get('DEFAULT', mode))[0]
        iform(output, 'service/')
        iform(output, 'serviceType="%s"' % s[0])
        iform(output, 'base="%s"' % s[1])
        iform(output, 'name="%s"' % s[2])
        iform(output, 'desc="%s"' % DEFAULT_THREDDS_SERVICE_DESCRIPTIONS[s[0]])
        iform(output, 'property/')
        iform(output, 'name="requires_authorization"')
        iform(output, 'value="%s"' % DEFAULT_THREDDS_SERVICE_AUTH_REQUIRED[s[0]], -1)
        for app in DEFAULT_THREDDS_SERVICE_APPLICATIONS[s[0]]:
            iform(output, ' property/')
            iform(output, 'name="application"')
            iform(output, 'value="%s"' % app, -1)
        iform(output, "", 0)

    iform(output, 'property/')
    iform(output, 'name="catalog_version"')
    iform(output, 'value="2"', -1)

    iform(output, 'dataset/')
    iform(output, 'restrictAccess="esg-user"')
    project = x.get('DEFAULT', 'project')
    iform(output, "ID=%s" % safe_quote(safe_interpolate(cfg, 'project:' + project, 'dataset_id', x)))
    iform(output, "name=%s" % safe_quote(safe_interpolate(cfg, 'project:' + project, 'dataset_name_format', x)))

    for name, value in x.items('DEFAULT'):
        iform(output, 'property/')
        iform(output, 'name="%s"' % name)
        iform(output, 'value="%s"' % value, -1)

    iform(output, 'metadata/')
    iform(output, 'variables/')

    # pdb.set_trace()
    for v in f.variables.keys():
        iform(output, 'variable/')
        iform(output, 'name="%s"' % safe_getattr(f.variables[v], "id", "name"))
        vname = vocabulary_name(f.variables[v])
        iform(output, 'vocabulary_name="%s"' % vname)
        iform(output, 'units="%s"' % safe_getattr(f.variables[v], "units"))
        iform(output, safe_getattr(f.variables[v], "long_name"), -1)

    for v in f.axes.keys():
        iform(output, 'variable/')
        iform(output, 'name="%s"' % v)
        vname = vocabulary_name(f.axes[v])
        iform(output, 'vocabulary_name="%s"' % vname)
        iform(output, 'units="%s"' % safe_getattr(f.axes[v], "units"))
        iform(output, safe_getattr(f.axes[v], "long_name"), -1)

    # pdb.set_trace()
    try:
        n = output.name
        if n == '<stdout>':
            rval = None
        else:
            rval = 'file:' + n
    except:
        rval = 'string:' + output.getvalue()

    return rval
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:",
                                       ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',
                                        'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=',
                                        'property=', 'read-directories', 'read-files', 'service=',
                                        'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs) == 0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None

    for flag, arg in args:
        if flag == '-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag == '--dataset':
            datasetName = arg
        elif flag == '--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag == '--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag == '--offline':
            offline = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--service':
            service = arg
        elif flag == '--use-version-dir':
            version_dir = True
        elif flag == '--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]):   # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties,
                                                      datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties,
                                                               datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s." % (ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s." % dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process." % use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset are available, use the latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)" %
                         (dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s' % (dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f" % float(mtime))
                    extraStuff = "mod_time=%f" % float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s' % datasetTechNotesURL)
                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes_title=%s' % datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'
                              % (entry[0], dataset_id))
                        skip_dataset = True   # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s' % entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s' % checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d' % (dataset_id_version, fpath, mapfile_md[fpath][0])
                for md in mapfile_md[fpath][1:]:
                    mapfile_line += ' | %s' % md

                # Print the map entry if:
                # - Checksum exists for all files of the dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and (
                        (appendMap is None) or
                        (not appendMap.has_key(ds_id)) or
                        ((fpath, "%d" % mapfile_md[fpath][0]) not in appendMap[ds_id])):
                    print >>output, mapfile_line

    else:   # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")

        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName,
                                                                  offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if ((appendMap is None) or
                    (not appendMap.has_key(dsetName)) or
                    ((filepath, "%d" % size) not in appendMap[dsetName])):
                print >>output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "", ['config-section=', 'echo', 'help', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "hsi"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag == '--config-section':
            configSection = arg
        elif flag == '--echo':
            echo = True
        elif flag == '--help':
            print usage
            sys.exit(0)
        elif flag == '--recursive':
            recurse = (arg.lower() == "yes")

    if len(lastargs) == 0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "R"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'hsi')
    path = lastargs[0]
    command_args = "ls -1s%s %s" % (recurseOption, path)

    if echo:
        print '%s %s' % (command, command_args)
        sys.exit(0)

    try:
        errout = subprocess.Popen([command, command_args], stderr=subprocess.PIPE).stderr
    except:
        raise ESGPublishError("Error running command '%s %s': check configuration option 'hsi'" % (command, command_args))
    lines = errout.readlines()
    errout.close()

    printit = False
    for line in lines:
        if printit:
            if line[0] == '*':
                raise Exception("Error accessing %s: %s" % (path, line))
            if line[0] != '-':
                fields = line.split()
                print fields[1], fields[0]
        else:
            printit = (line[0:8] == "Username")
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "hi:",
                                       ['database-delete', 'database-only', 'echo-sql', 'map=',
                                        'no-republish', 'no-thredds-reinit', 'skip-gateway', 'skip-index',
                                        'las', 'log=', 'rest-api', 'skip-thredds', 'sync-thredds',
                                        'use-list='])
    except getopt.error:
        print sys.exc_value
        return

    deleteAll = False
    datasetMap = None
    deleteDset = False
    unpublishOnGateway = False
    echoSql = False
    init_file = None
    gatewayOp = DELETE
    las = False
    log_filename = None
    republish = True
    restApi = None
    thredds = True
    syncThredds = False
    useList = False
    threddsReinit = True

    for flag, arg in args:
        if flag == '--database-delete':
            deleteDset = True
        elif flag == '--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag == '--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            return
        elif flag == '-i':
            init_file = arg
        elif flag == '--map':
            datasetMap = readDatasetMap(arg)
        elif flag == '--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag == '--skip-index':
            gatewayOp = NO_OPERATION
        elif flag == '--las':
            las = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--no-republish':
            republish = False
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--skip-thredds':
            thredds = False
        elif flag == '--sync-thredds':
            syncThredds = True
        elif flag == '--use-list':
            useList = True
            useListPath = arg

    if gatewayOp != NO_OPERATION and unpublishOnGateway:
        gatewayOp = UNPUBLISH

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    if config is None:
        raise ESGPublishError("No configuration file found.")
    threddsRoot = config.get('DEFAULT', 'thredds_root')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    if datasetMap is None:
        if not useList:
            datasetNames = [parseDatasetVersionId(item) for item in lastargs]
        else:
            if useListPath == '-':
                namelist = sys.stdin
            else:
                namelist = open(useListPath)
            datasetNames = []
            for line in namelist.readlines():
                versionId = parseDatasetVersionId(line.strip())
                datasetNames.append(versionId)
    else:
        datasetNames = datasetMap.keys()
        datasetNames.sort()

    result = deleteDatasetList(datasetNames, Session, gatewayOp, thredds, las, deleteDset,
                               deleteAll=deleteAll, republish=republish, reinitThredds=threddsReinit,
                               restInterface=restApi)

    # Republish previous versions as needed.  This will happen if the latest version
    # was deleted from the database, and is not the only version.  In this case the
    # previous version will be rescanned to generate the aggregations.
    if republish:
        statusDict, republishList = result
        if len(republishList) > 0:

            # Register project handlers
            registerHandlers()

            info("Republishing modified datasets:")
            republishDatasetNames = [generateDatasetVersionId(dsetTuple) for dsetTuple in republishList]
            dmap, offline = queryDatasetMap(republishDatasetNames, Session)
            datasetNames = dmap.keys()
            datasets = iterateOverDatasets(None, dmap, None, republishList, Session, "time", UPDATE_OP,
                                           None, {}, offline, {}, forceAggregate=True)
            republishOp = (gatewayOp != NO_OPERATION)   # Don't republish if skipping the gateway op
            result = publishDatasetList(datasetNames, Session, publish=republishOp, thredds=thredds)

    # Synchronize database and THREDDS catalogs
    if syncThredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')

        # Make a dictionary of catalogs from the database
        session = Session()
        subcatalogs = session.query(Catalog).select_from(
            join(Catalog, Dataset, Catalog.dataset_name == Dataset.name)).all()
        catdict = {}
        for catalog in subcatalogs:
            location = os.path.join(threddsRoot, catalog.location)
            catdict[location] = 1
        session.close()

        # Scan all XML files in the threddsroot
        os.path.walk(threddsRoot, cleanupCatalogs, catdict)
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru",
                                       ['append', 'create', 'dataset=', 'delete-files', 'echo-sql',
                                        'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=',
                                        'message=', 'model=', 'offline', 'parent=', 'per-time',
                                        'per-variable', 'project=', 'property=', 'publish', 'new-version=',
                                        'no-thredds-reinit', 'noscan', 'read-directories', 'read-files',
                                        'rename-files', 'replace', 'replica=', 'rest-api', 'service=',
                                        'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit',
                                        'update', 'use-existing=', 'use-list=', 'validate=',
                                        'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag == '-a':
            aggregateDimension = arg
        elif flag == '--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag == '--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag == '--echo-sql':
            echoSql = True
        elif flag == '--experiment':
            initcontext['experiment'] = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--keep-version':
            keepVersion = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag == '--model':
            initcontext['model'] = arg
        elif flag == '--nodbwrite':
            nodbwrite = True
        elif flag == '--new-version':
            try:
                version = string.atoi(arg)
                if version <= 0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s" % arg)
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--noscan':
            publishOnly = True
        elif flag == '--offline':
            offline = True
        elif flag == '--parent':
            parent = arg
        elif flag == '--per-time':
            perVariable = False
        elif flag == '--per-variable':
            perVariable = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag == '--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag == '--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--service':
            service = arg
        elif flag == '--set-replica':
            masterGateway = 'DEFAULT'
        elif flag == '--summarize-errors':
            summarizeErrors = True
        elif flag == '--thredds':
            thredds = True
        elif flag == '--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag == '--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag == '--use-list':
            rescan = True
            if arg == '-':
                namelist = sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line[0] != '#':
                    rescanDatasetName.append(line)
        elif flag == '--validate':
            schema = arg
            restApi = True
        elif flag == '--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs) == 0:
                print "No directories specified."
                return

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props,
                                                            datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props,
                                                                     datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map: dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                      filefilt=filefilt,
                                                                      datasetName=datasetName, offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]
            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")

    # Iterate over datasets
    if not publishOnly:
        # pdb.set_trace()
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session,
                                       aggregateDimension, publishOp, filefilt, initcontext, offline,
                                       properties, keepVersion=keepVersion, newVersion=version,
                                       extraFields=extraFields, masterGateway=masterGateway,
                                       comment=message, readFiles=readFiles, nodbwrite=nodbwrite)

    if not nodbwrite:
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las,
                                    parentId=parent, service=service, perVariable=perVariable,
                                    reinitThredds=threddsReinit, restInterface=restApi, schema=schema)
        # print `result`

    if summarizeErrors:
        print 'Summary of errors:'
        for name, versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session),\
                dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ===' % dset.name
                for line in dset.get_warnings(Session):
                    print line
def main(argv): global DEFAULT_QUERY_SERVICE global DEFAULT_WGET_SERVICE try: args, lastargs = getopt.getopt(argv, "d:ho:pq:t:v", [ 'count', 'delimiter=', 'facet-query=', 'facets=', 'fields=', 'format=', 'free-text=', 'help', 'limit=', 'offset=', 'pretty-print', 'service-url=', 'type=', 'verbose' ]) except getopt.error: print sys.exc_value print usage sys.exit(0) # Get the search URL from the publisher configuration if possible try: from esgcet.config import loadConfig config = loadConfig(None) DEFAULT_QUERY_SERVICE = config.get("DEFAULT", "solr_search_service_url", default=DEFAULT_QUERY_SERVICE) DEFAULT_WGET_SERVICE = config.get("DEFAULT", "solr_wget_service_url", default=DEFAULT_WGET_SERVICE) except: pass allFacets = False # facets=* countOnly = False delim = None facets = [] facetValues = None fields = [] format = DEFAULT_FORMAT freetext = None includeId = False objtype = DATASET offset = 0 outpath = sys.stdout outpathIsStdout = True prettyPrint = False service = DEFAULT_QUERY_SERVICE userLimit = MAX_RECORDS verbose = False wgetScript = False wgetService = DEFAULT_WGET_SERVICE for flag, arg in args: if flag == '--count': countOnly = True elif flag in ['-d', '--delimiter']: delim = arg prettyPrint = False elif flag == '--facets': facetList = arg.split(',') facetValues = [item.strip() for item in facetList] allFacets = (facetValues[0] == '*') elif flag == '--fields': fieldList = arg.split(',') fields = [item.strip() for item in fieldList] elif flag == '--format': if arg not in ['narrow', 'wide', 'wget', 'xml']: raise RuntimeError("Invalid format: %s" % arg) format = arg if arg == 'wget': wgetScript = True elif flag in ['-h', '--help']: print usage sys.exit(0) elif flag == '--limit': try: userLimit = int(arg) except: raise RuntimeError("Invalid limit: %s" % arg) elif flag == '-o': outpath = open(arg, 'w') outpathIsStdout = False elif flag in ['-p', '--pretty-print']: prettyPrint = True elif flag == '--offset': offset = int(arg) elif flag in ['-q', '--facet-query']: queries = arg.split(',') for q in queries: f, v = q.split('=') facets.append((f.strip(), v.strip())) elif flag == '--service-url': service = arg wgetService = arg elif flag in ['-t', '--free-text']: freetext = arg elif flag == '--type': try: objtype = typeCode[arg] except: raise RuntimeError("Invalid return type: %s" % arg) elif flag in ['-v', '--verbose']: verbose = True # If a wget script is requested: # - Object type is File # - fields are url # - limit is min(1000, limit) if wgetScript: objtype = FILE fields = ['url'] userLimit = min(userLimit, WGET_MAX_RECORDS) countOnly = False allFacets = False facetValues = None query = formulateQuery(facets, fields, format, freetext, objtype, wgetService, offset, userLimit, facetValues=facetValues) if verbose: print >> sys.stderr, 'Query: ', query downloadResult(query, outpath, outpathIsStdout) return # XML output if format == 'xml': userLimit = min(userLimit, DEFAULT_CHUNKSIZE) query = formulateQuery(facets, fields, format, freetext, objtype, service, offset, userLimit, facetValues=facetValues) if verbose: print >> sys.stderr, 'Query: ', query downloadResult(query, outpath, outpathIsStdout) return # If returning id only, use wide format if fields == ['id']: format = 'wide' includeId = True # For facet value queries, use wide format. 
if facetValues is not None: format = 'wide' # While remaining data: fullResults = [] numFound = 0 moredata = True nread = 0 nleft = userLimit chunksize = DEFAULT_CHUNKSIZE while moredata: # Formulate a query if not (countOnly or facetValues is not None): limit = min(nleft, chunksize) else: limit = 0 query = formulateQuery(facets, fields, format, freetext, objtype, service, offset, limit, facetValues=facetValues) if verbose: print >> sys.stderr, 'Query: ', query # Read a chunk chunk = readChunk(service, query) # Parse the response. For facet value searches, parse the response trailer if facetValues is None: scoreInFields = ('score' in fields) results, numFound, numResults = parseResponse( chunk, includeId, scoreInFields) fullResults.extend(results) else: numResults = 0 if allFacets: fullResults, numFound = parseHeader(chunk) else: fullResults, numFound = parseTrailer(chunk, facetValues, includeId) # More data if some results were found and the number of records read < total nread += numResults nleft -= limit moredata = (numResults > 0) and (nread < min(numFound, userLimit)) offset += limit # Output the results if not (countOnly or facetValues is not None): outputResults(fullResults, format, prettyPrint=prettyPrint, printHeaders=True, delimiter=delim, out=outpath) elif facetValues is not None: for valueList, head in zip(fullResults[0], fullResults[1]): if allFacets: header = (head, ) else: header = (head, 'count') outputFacetResults(valueList, header, prettyPrint=prettyPrint, printHeaders=True, delimiter=delim, out=outpath) else: print numFound if not outpathIsStdout: outpath.close()
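# The query loop above pages through results DEFAULT_CHUNKSIZE records at a time, advancing
# offset and stopping once a chunk comes back empty or the number read reaches
# min(numFound, userLimit). A standalone sketch of that paging pattern, assuming a
# hypothetical fetch_page(offset, limit) that wraps formulateQuery/readChunk/parseResponse
# and returns (records, numFound):
def page_all(fetch_page, user_limit, chunksize):
    results = []
    offset, nread, nleft = 0, 0, user_limit
    moredata = True
    while moredata:
        limit = min(nleft, chunksize)
        records, numFound = fetch_page(offset, limit)
        results.extend(records)
        nread += len(records)
        nleft -= limit
        offset += limit
        # Keep going only while chunks are non-empty and the requested limit is not reached.
        moredata = len(records) > 0 and nread < min(numFound, user_limit)
    return results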
def main(argv): try: args, lastargs = getopt.getopt( argv, "", ['config-section=', 'echo', 'help', 'recursive=']) except getopt.error: print sys.exc_value print usage sys.exit(0) configSection = "msls" echo = False recurse = True for flag, arg in args: if flag == '--config-section': configSection = arg elif flag == '--echo': echo = True elif flag == '--help': print usage sys.exit(0) elif flag == '--recursive': recurse = (arg.lower() == "yes") if len(lastargs) == 0: print "No directory specified." print usage sys.exit(0) if recurse: recurseOption = "R" else: recurseOption = "" config = loadConfig(None) command = config.get(configSection, 'msls') path = lastargs[0] command_args = "-l%s" % recurseOption if echo: print '%s %s %s' % (command, command_args, path) sys.exit(0) try: errout = subprocess.Popen([command, command_args, path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout except: raise ESGPublishError( "Error running command '%s %s': check configuration option 'msls'" % (command, command_args)) lines = errout.readlines() errout.close() directory = path for line in lines: line = line.strip() # Skip blank lines if len(line) == 0: continue # File elif line[0] == '-': fields = line.split() fullpath = os.path.join(directory, fields[-1]) print fullpath, fields[4] # Directory elif line[0] == '/' and line[-1] == ':': directory = line[:-1] # Error elif line[0] == '/': raise ESGPublishError("Error: %s" % line) # Skip else: continue
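# The msls parser above walks "ls -lR"-style output: lines starting with '-' are files
# (size in column 5, name last), "/some/dir:" lines switch the current directory, and any
# other line starting with '/' is treated as an error. A standalone sketch of the same
# parsing, yielding (fullpath, size) pairs instead of printing them; error handling is
# simplified to ValueError rather than ESGPublishError:
import os

def parse_listing(lines, start_dir):
    directory = start_dir
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('-'):                      # regular file entry
            fields = line.split()
            yield os.path.join(directory, fields[-1]), fields[4]
        elif line.startswith('/') and line.endswith(':'):
            directory = line[:-1]                     # new directory header
        elif line.startswith('/'):
            raise ValueError("Error: %s" % line)      # anything else rooted at '/' is an error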
def main(): """Uses the esg.ini file options: - thredds_file_services to get a Globus endpoint UUID - thredds_root to find a directory with THREDDS xml catalogs """ loadConfig(None) config = getConfig() if config is None: raise ESGPublishError('No configuration file found') # By default thredds_root is: /esg/content/thredds/esgcet thredds_root = config.get('DEFAULT', 'thredds_root') thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT', 'thredds_file_services') # parameters needed to re-harvest the THREDDS catalogs thredds_url = config.get('DEFAULT', 'thredds_url') hessian_service_certfile = config.get('DEFAULT', 'hessian_service_certfile') hessian_service_url = config.get('DEFAULT', 'hessian_service_url') esgf_harvesting_service_url = hessian_service_url.replace( 'remote/secure/client-cert/hessian/publishingService', 'ws/harvest') thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..')) globus_base = None for service in thredds_file_services: if service[2] == 'Globus': globus_base = service[1] if globus_base is None: print 'No Globus file service specified in %s\n'\ 'Add Globus file service to the thredds_file_services variable in the form:\n'\ ' Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\ 'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI'] sys.exit(1) print '\n'\ 'ESGINI: %s\n'\ 'THREDDS root: %s\n'\ 'THREDDS url: %s\n'\ 'Globus service base: %s\n'\ 'ESGF harvesting service url: %s\n'\ 'X.509 user credential: %s\n'\ '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base, esgf_harvesting_service_url, hessian_service_certfile) if not globus_base.endswith('/'): print 'Globus service base must end with "/". Set Globus service base correctly in\n'\ '%s end run the script again.' % os.environ['ESGINI'] sys.exit(1) print 'The script recursively goes through xml files in %s\n'\ 'looking for datasets that were published without Globus file service and adds\n'\ 'Globus access to the datasets. If a dataset was published with Globus file\n'\ 'service configured, the script skips such a dataset leaving a corresponding xml\n'\ 'file unmodified. The script reinitializes THREDDS and requests Hessian service to\n'\ 'to harvest the updated xml files. Because Hessian service requires SSL\n'\ 'authentication, the X.509 certificate, %s,\n'\ 'should be valid and obtained by a user who has the publisher role in all\n'\ 'projects.\n'\ 'It is strongly advised that you make a copy of the entire %s\n'\ 'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up) while True: sys.stdout.write("Do you want to continue? [y/N]") line = sys.stdin.readline().rstrip() if line == '' or line == 'n' or line == 'N': sys.exit(0) if line == 'y' or line == 'Y': break process(thredds_root, thredds_root_up, globus_base, thredds_url, esgf_harvesting_service_url, hessian_service_certfile)
def main(argv): global DB_Dict, TDS_Dict, Ref_XML_Errors, VERBOSE, SOLR_HTTP, rprt_fl, DBNoneValueURL, DB_Dict_Redund try: opts, args = getopt.getopt(argv, "ahve:m:p:l:", ['all', 'help', 'verbose', 'DT', 'SD', 'ST', 'log=', 'project=', 'model=', 'experiment=']) except getopt.GetoptError: print sys.exc_value print usage sys.exit(1) CMP_FLG = 0 Task_Message = "comparison of all metadata sources: DB, TDS, Solr" proj_cnstr = None model_cnstr = None exper_cnstr = None ts = time.time() st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') fl_stmp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d.%H:%M:%S') rprt_file_name = "meta_synchro." + fl_stmp + ".log" constr_categ = ["project","model","experiment"] if len(args) > 0: print "Wrong argument: ", args[0] print usage exit(1) if len(opts) == 0: print usage exit(0) for opt, arg in opts: if opt in ['-a', '--all']: proj_cnstr = None model_cnstr = None exper_cnstr = None elif opt in ['--DT']: CMP_FLG = 1 Task_Message = "comparison DB <-> TDS" elif opt in ['--SD']: CMP_FLG = 2 Task_Message = "comparison Solr <-> DB" elif opt in ['--ST']: CMP_FLG = 3 Task_Message = "comparison Solr <-> TDS" elif opt in ['-p', '--project']: proj_cnstr = arg elif opt in ['-m', '--model']: model_cnstr = arg elif opt in ['-e', '--experiment']: exper_cnstr = arg elif opt in ['-l', '--log']: rprt_file_name = arg elif opt in ['-v', '--verbose']: VERBOSE = True elif opt in ['-h', '--help']: print help_message exit(0) else: print "Wrong option: ", opt print usage exit(1) rprt_fl = open(rprt_file_name,'w') rprt_fl.write(st) config = loadConfig(init_dir) engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600) Session = sessionmaker(bind=engine, autoflush=True, autocommit=False) session = Session() # TDS_Dict = {ds_name:xml_ref_file} # DB_Dict = {ds_id:ds_name} # Solr_DS_Lst = [ds] # DB_TDS_DIFF_DS = {"dataset_name ds_id":["Y", "N"], "dataset_name xml_rel_path":["N", "Y"]} # e.g.: {"cmip5.output1.NOAA-GFDL.GFDL-ESM2M.1pctCO2.mon.atmos.Amon.r1i1p2.v20130214 51" ["N","Y"], # "cmip5.output1.NOAA-GFDL.GFDL-ESM2M.abrupt4xCO2.day.atmos.day.r1i1p1 5213" ["Y", "N"], ...} # DB_TDS_Shared_DS_Dict = {"DB_dataset_id rel_xmlpath" : TDS ref_xml} DB_DS_URLS = {} DB_TDS_DIFF_DS = {} Solr_Only_Files_Dict = {} DB_Only_Files_Dict = {} TDS_Only_Files_Dict = {} constr = (proj_cnstr, model_cnstr, exper_cnstr) if proj_cnstr is None and model_cnstr is None and exper_cnstr is None: cnstr_msg = "No constraints: all datasets are going to be verified." 
else: cnstr_msg = "\nConstraints choosen: " for i in range(len(constr)): if constr[i] is not None: cnstr_msg = cnstr_msg + "\n\t" + constr_categ[i] +": " + constr[i] print st print "\n=================> Started ", Task_Message print "report ->", rprt_file_name print cnstr_msg rprt_fl.write("\n\n"+Task_Message) rprt_fl.write("\n"+cnstr_msg+"\n") if CMP_FLG != 3: # PostgreSQL (DB_Dict, DB_Dict_Redund) = getDBDatasetDict(session, constr) print "\n=================> DB Dictionary (size= " + str(len(DB_Dict)) +")" if VERBOSE: print "\n".join((str(k) + " : " + DB_Dict[k]) for k in DB_Dict.keys()) rprt_fl.write("\n\n=================> DB Datasets (size= " + str(len(DB_Dict)) +")\n") if VERBOSE: rprt_fl.write( "\n".join((str(k) + " : " + DB_Dict[k]) for k in DB_Dict.keys()) ) if len(DB_Dict_Redund)>0: rprt_fl.write("=====> DB table 'dataset' contains records (" + str(len(DB_Dict_Redund)) +\ ") with different ids but the same dataset name/version: =====") rprt_fl.write( "\n".join( (k + " " + DB_Dict_Redund[k]) for k in DB_Dict_Redund.keys() ) ) if CMP_FLG != 2: # THREDDS (TDS_Dict, TDS_Dict_Redund, TDS_Dict_Broken) = getTDSDatasetDict(config, constr) print "\n=================> TDS Dictionary (size= " + str(len(TDS_Dict)) +")" rprt_fl.write("\n\n=================> TDS Datasets (size= " + str(len(TDS_Dict)) +")\n") if VERBOSE: print "\n".join((k + " : " + TDS_Dict[k]) for k in TDS_Dict.keys()) rprt_fl.write( "\n".join((k + " : " + TDS_Dict[k]) for k in TDS_Dict.keys()) ) if len(TDS_Dict_Redund)>1: rprt_fl.write("=====> TDS_Dict_Redund: (the same dataset names are listed multple times in main TDS catalog: =====>\n") rprt_fl.write("\n".join( (k + " : " + TDS_Dict_Redund[k]) for k in TDS_Dict_Redund.keys() ) ) if len(TDS_Dict_Broken)>0: rprt_fl.write("=====> TDS_dataset names do not correspond to reference catalog xmls in main catalog. There are " + str (len(TDS_Dict_Broken)) + " records: =====>\n") rprt_fl.write("\n".join( (k + " " + TDS_Dict_Broken[k]) for k in TDS_Dict_Broken.keys() ) ) if CMP_FLG != 1: # Solr (Solr_MultiVersion_DS, Solr_DS_Lst) = getSolrDatasetList(constr) print "\n=================> Solr Dataset List (size= " + str(len(Solr_DS_Lst)) + ")" if VERBOSE: print "\n".join(ds for ds in Solr_DS_Lst) rprt_fl.write("\n\n=================> Solr Datasets (size= " + str(len(Solr_DS_Lst)) + ")\n") if VERBOSE: rprt_fl.write("\n".join(ds for ds in Solr_DS_Lst)) if len(Solr_MultiVersion_DS) > 0: rprt_fl.write(" \n\n =======> These Datasets are reperesented in Solr in multiversions (only last version is used in comparison): =====>\n") rprt_fl.write("\n".join( (ds + " : [" + ", ".join(v for v in Solr_MultiVersion_DS[ds]) + "]") for ds in Solr_MultiVersion_DS.keys())) if (CMP_FLG == 1 or CMP_FLG == 0) and (len(DB_Dict)>0 or len(TDS_Dict)>0): DB_THREDDS_Comparison(DB_Dict,TDS_Dict, config, session) if (CMP_FLG == 2 or CMP_FLG == 0) and (len(DB_Dict)>0 or len(Solr_DS_Lst)>0): SOLR_DB_Comparison(Solr_DS_Lst, DB_Dict, SOLR_HTTP, session) if (CMP_FLG == 3 or CMP_FLG == 0) and (len(TDS_Dict)>0 or len(Solr_DS_Lst)>0): SOLR_TDS_Comparison(Solr_DS_Lst, TDS_Dict, SOLR_HTTP, config) ts = time.time() st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') print "\n",st rprt_fl.write("\n"+st) rprt_fl.close() session.close()