def call_sessionmaker(root):
    from sqlalchemy.orm import sessionmaker
    from sqlalchemy import create_engine
    from esgcet.config import loadConfig, initLogging, registerHandlers

    # init_file = "../scripts/esg.ini"
    init_file = None                    # Load installed init file
    echoSql = True

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    root.echoSql = echoSql              # record the flag on the GUI root before it is used below
    root.engine = create_engine(config.getdburl('extract'), echo=root.echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=root.engine)
    Session = sessionmaker(bind=root.engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    root.config = config
    root.Session = Session
    root.projectName = None
    root.firstFile = None
    root.dmap = None
    root.extraFields = None
    root.directoryMap = None
    root.datasetMapfile = None
    root.filefilt = None
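
# call_sessionmaker() expects a mutable "root" object (normally the GUI's top-level
# control object) on which it can hang the engine, the session factory, and the scan
# state. A minimal sketch of using it outside the GUI, assuming an installed esg.ini;
# the class name below is illustrative, not part of esgcet.
class PublisherRoot(object):
    """Bare container standing in for the GUI root object."""
    pass

root = PublisherRoot()
call_sessionmaker(root)

# After the call, root carries the shared publication state:
session = root.Session()        # open a SQLAlchemy session bound to root.engine
session.close()
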
def publishCatalogs(threddsCatalogDictionary, parentDatasetIdDictionary, thredds=True, las=True, publish=True):
    """
    Add one or more THREDDS catalogs to the THREDDS catalog directory,
    reinitialize THREDDS, and publish the catalogs to the gateway.

    Returns a dictionary mapping (datasetName, version) => status, as returned
    from ``publish.publishDatasetList``. If ``publish`` is False, the returned
    dictionary is empty.

    threddsCatalogDictionary
      Dictionary of THREDDS catalogs, as returned from
      ``generateReplicaThreddsCatalog``. The dictionary maps
      datasetId => string THREDDS catalog.

    parentDatasetIdDictionary
      Dictionary mapping datasetId => parent dataset identifier in the
      gateway hierarchy.

    thredds=True
      Boolean flag. If True, copy the catalogs to the THREDDS catalog
      directory and reinitialize the TDS server.

    las=True
      Boolean flag. If True, reinitialize the LAS server.

    publish=True
      Boolean flag. If True, publish the catalog(s) to the gateway.
    """
    # Load the configuration and set up a database connection
    config, Session = initdb()

    # Register project handlers
    registerHandlers()

    datasetNames = threddsCatalogDictionary.keys()
    result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las,
                                parentId=parentDatasetIdDictionary,
                                threddsCatalogDictionary=threddsCatalogDictionary,
                                reinitThredds=None, readFromCatalog=True)
    return result
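
# A hedged usage sketch for publishCatalogs(). The catalogs are assumed to have been
# produced upstream by generateReplicaThreddsCatalog() as the docstring describes;
# the parent id string below is a placeholder, not a real gateway identifier.
# threddsCatalogDictionary: datasetId => THREDDS catalog XML string
parentDatasetIdDictionary = dict((datasetId, "pcmdi.replica_holdings")   # placeholder parent id
                                 for datasetId in threddsCatalogDictionary)
statusDictionary = publishCatalogs(threddsCatalogDictionary, parentDatasetIdDictionary,
                                   thredds=True, las=False, publish=True)
for (datasetName, versionno), status in statusDictionary.items():
    print datasetName, versionno, status
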
def load_configuration(parent):
    import os
    import pub_controls
    from esgcet.config import getHandler, getHandlerByName, registerHandlers, CFHandler
    from sqlalchemy.orm import sessionmaker
    from esgcet.publish import multiDirectoryIterator, datasetMapIterator

    offline = parent.offline
    firstFile = parent.firstFile
    projectName = parent.projectName
    config = parent.config
    Session = parent.Session
    dmap = parent.dmap
    datasetNames = parent.datasetNames
    datasetMapfile = parent.datasetMapfile

    for datasetName in datasetNames:
        # Get a file iterator and sample file
        if datasetMapfile is not None:
            firstFile = dmap[datasetName][0][0]
            fileiter = datasetMapIterator(dmap, datasetName)
        else:
            direcTuples = parent.directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], parent.filefilt)

        # Register project handlers
        registerHandlers()

        # If the project is not specified, try to read it from the first file
        validate = True
        if projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=validate, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=validate)
        parent.handler = handler

    # View the collection of datasets
    tab_name = "Collection %i" % parent.top_ct
    parent.ntk.new_page(parent, tab_name)
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", [
            'dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=',
            'property=', 'read-directories', 'read-files',
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs) == 0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None

    for flag, arg in args:
        if flag == '-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag == '--dataset':
            datasetName = arg
        elif flag == '--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag == '--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag == '--offline':
            offline = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--service':
            service = arg
        elif flag == '--use-version-dir':
            version_dir = True
        elif flag == '--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]):    # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties,
                                                      datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties,
                                                               datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # Check for a version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s." % (ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s." % dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process." % use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # If multiple versions of the same dataset are available, use the latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"
                         % (dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s' % (dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f" % float(mtime))

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s' % datasetTechNotesURL)
                    if datasetTechNotesTitle is not None:
                        mapfile_md[filepath].append('dataset_tech_notes_title=%s' % datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...' % (entry[0], dataset_id))
                        skip_dataset = True     # skip the entire dataset if any file lacks a checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s' % entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s' % checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d' % (dataset_id_version, fpath, mapfile_md[fpath][0])
                for md in mapfile_md[fpath][1:]:
                    mapfile_line += ' | %s' % md

                # Print the map entry if:
                # - Checksums exist for all files of the dataset (when checksumming is enabled), and
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                # Compare against the existing map entry by (path, size)
                if not skip_dataset and ((appendMap is None) or (not appendMap.has_key(ds_id)) or
                                         ((fpath, "%d" % mapfile_md[fpath][0]) not in appendMap[ds_id])):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")

        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName,
                                                                  offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d" % size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
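
# The threaded checksum step above calls a calc_checksum_wrapper helper that is defined
# elsewhere in the script and not shown here. A minimal sketch of the shape such a wrapper
# could take, assuming the configured checksum client behaves like md5sum/sha256sum and
# prints "<checksum> <path>"; this is an illustration, not the esgcet implementation.
import subprocess

def calc_checksum_wrapper(args):
    # Unpack the (filepath, checksumClient) tuple handed over by ThreadPool.map()
    filepath, checksumClient = args
    try:
        out = subprocess.Popen([checksumClient, filepath],
                               stdout=subprocess.PIPE).communicate()[0]
        checksum_value = out.split()[0]
    except (OSError, IndexError):
        checksum_value = None           # signals failure; the caller then skips the dataset
    return (filepath, checksum_value)
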
def esgpublishWrapper(**kw):

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", '.*\.nc$')
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()
    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since they are already
        # in the database and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()
    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(directoryList, filefilt, initContext=props,
                                                            datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, initContext=props,
                                                                     datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map: dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                      filefilt=filefilt, datasetName=datasetName,
                                                                      offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session,
                                       aggregateDimension, publishOp, filefilt, initcontext, offline, properties,
                                       keepVersion=keepVersion, newVersion=version, extraFields=extraFields,
                                       masterGateway=masterGateway, comment=message, readFiles=readFiles)

    result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las,
                                parentId=parent, service=service, perVariable=perVariable,
                                threddsCatalogDictionary=threddsCatalogDictionary,
                                reinitThredds=reinitThredds, readFromCatalog=readFromCatalog)

    return result
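
# A hedged usage sketch for esgpublishWrapper(). The scan directory, project name, and
# flag values are placeholders; only a few of the recognized keywords are shown.
result = esgpublishWrapper(
    directoryList=["/data/cmip5/output1"],      # placeholder scan root
    projectName="cmip5",                        # placeholder project name
    init_file=None,                             # use the installed esg.ini
    thredds=True,
    publish=True,
)
for (datasetName, versionno), status in result.items():
    print datasetName, versionno, status
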
def esgscanWrapper(directoryList, **kw):

    if len(directoryList) == 0:
        raise ESGPublishError('No directory specified')

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, 'a')
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", '.*\.nc$')
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, 'w')
    # (no else branch: leave 'output' as the append-mode handle opened above, or sys.stdout)
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList, filefilt, datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
                    if ((appendMap is None) or (not appendMap.has_key(datasetId)) or
                            ((filepath, "%d" % size) not in appendMap[datasetId])):
                        print >>output, "%s | %s | %d | %s" % (datasetId, filepath, size, extraStuff)

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")

        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName,
                                                                  offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if ((appendMap is None) or (not appendMap.has_key(dsetName)) or
                    ((filepath, "%d" % size) not in appendMap[dsetName])):
                print >>output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
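
# A hedged usage sketch for esgscanWrapper(). The scan root and output path are
# placeholders. Each emitted map line has the form:
#   dataset_id | path | size | mod_time=... [| checksum=... | checksum_type=...]
esgscanWrapper(["/data/cmip5/output1"],
               projectName="cmip5",             # placeholder project name
               outputPath="/tmp/cmip5.map",     # dataset map written here
               readFiles=False)                 # walk directories rather than explicit file lists
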
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "hi:", [
            'database-delete', 'database-only', 'echo-sql', 'map=', 'no-republish',
            'no-thredds-reinit', 'skip-gateway', 'skip-index', 'las', 'log=',
            'rest-api', 'skip-thredds', 'sync-thredds', 'use-list='])
    except getopt.error:
        print sys.exc_value
        return

    deleteAll = False
    datasetMap = None
    deleteDset = False
    unpublishOnGateway = False
    echoSql = False
    init_file = None
    gatewayOp = DELETE
    las = False
    log_filename = None
    republish = True
    restApi = None
    thredds = True
    syncThredds = False
    useList = False
    threddsReinit = True

    for flag, arg in args:
        if flag == '--database-delete':
            deleteDset = True
        elif flag == '--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag == '--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            return
        elif flag == '-i':
            init_file = arg
        elif flag == '--map':
            datasetMap = readDatasetMap(arg)
        elif flag == '--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag == '--skip-index':
            gatewayOp = NO_OPERATION
        elif flag == '--las':
            las = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--no-republish':
            republish = False
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--skip-thredds':
            thredds = False
        elif flag == '--sync-thredds':
            syncThredds = True
        elif flag == '--use-list':
            useList = True
            useListPath = arg

    if gatewayOp != NO_OPERATION and unpublishOnGateway:
        gatewayOp = UNPUBLISH

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    if config is None:
        raise ESGPublishError("No configuration file found.")
    threddsRoot = config.get('DEFAULT', 'thredds_root')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    if datasetMap is None:
        if not useList:
            datasetNames = [parseDatasetVersionId(item) for item in lastargs]
        else:
            if useListPath == '-':
                namelist = sys.stdin
            else:
                namelist = open(useListPath)
            datasetNames = []
            for line in namelist.readlines():
                versionId = parseDatasetVersionId(line.strip())
                datasetNames.append(versionId)
    else:
        datasetNames = datasetMap.keys()
        datasetNames.sort()

    result = deleteDatasetList(datasetNames, Session, gatewayOp, thredds, las, deleteDset,
                               deleteAll=deleteAll, republish=republish,
                               reinitThredds=threddsReinit, restInterface=restApi)

    # Republish previous versions as needed. This will happen if the latest version was
    # deleted from the database and is not the only version. In this case the previous
    # version will be rescanned to generate the aggregations.
    if republish:
        statusDict, republishList = result
        if len(republishList) > 0:

            # Register project handlers.
            registerHandlers()

            info("Republishing modified datasets:")
            republishDatasetNames = [generateDatasetVersionId(dsetTuple) for dsetTuple in republishList]
            dmap, offline = queryDatasetMap(republishDatasetNames, Session)
            datasetNames = dmap.keys()
            datasets = iterateOverDatasets(None, dmap, None, republishList, Session, "time",
                                           UPDATE_OP, None, {}, offline, {}, forceAggregate=True)
            republishOp = (gatewayOp != NO_OPERATION)   # Don't republish if skipping the gateway op
            result = publishDatasetList(datasetNames, Session, publish=republishOp, thredds=thredds)

    # Synchronize database and THREDDS catalogs
    if syncThredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')

        # Make a dictionary of catalogs from the database
        session = Session()
        subcatalogs = session.query(Catalog).select_from(
            join(Catalog, Dataset, Catalog.dataset_name == Dataset.name)).all()
        catdict = {}
        for catalog in subcatalogs:
            location = os.path.join(threddsRoot, catalog.location)
            catdict[location] = 1
        session.close()

        # Scan all XML files in the threddsRoot
        os.path.walk(threddsRoot, cleanupCatalogs, catdict)
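
# The --sync-thredds branch hands a cleanupCatalogs callback to os.path.walk; that helper
# is not shown in this section. A minimal sketch of the shape such a visitor could take,
# assuming catalogs on disk that are absent from the database dictionary should simply be
# reported (the real helper's behavior, e.g. deleting the orphan file, may differ).
import os

def cleanupCatalogs(catdict, dirname, names):
    # os.path.walk visitor: 'catdict' maps known catalog paths (from the database) to 1.
    for name in names:
        base, ext = os.path.splitext(name)
        if ext != '.xml':
            continue
        path = os.path.join(dirname, name)
        if path not in catdict:
            # Catalog exists on disk but not in the database: flag it for cleanup.
            print 'Orphan THREDDS catalog:', path
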
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", [
            'append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=',
            'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline', 'parent=',
            'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=',
            'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files',
            'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors',
            'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=',
            'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag == '-a':
            aggregateDimension = arg
        elif flag == '--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag == '--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag == '--echo-sql':
            echoSql = True
        elif flag == '--experiment':
            initcontext['experiment'] = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--keep-version':
            keepVersion = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag == '--model':
            initcontext['model'] = arg
        elif flag == '--nodbwrite':
            nodbwrite = True
        elif flag == '--new-version':
            try:
                version = string.atoi(arg)
                if version <= 0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s" % arg)
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--noscan':
            publishOnly = True
        elif flag == '--offline':
            offline = True
        elif flag == '--parent':
            parent = arg
        elif flag == '--per-time':
            perVariable = False
        elif flag == '--per-variable':
            perVariable = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag == '--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag == '--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--service':
            service = arg
        elif flag == '--set-replica':
            masterGateway = 'DEFAULT'
        elif flag == '--summarize-errors':
            summarizeErrors = True
        elif flag == '--thredds':
            thredds = True
        elif flag == '--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag == '--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag == '--use-list':
            rescan = True
            if arg == '-':
                namelist = sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line[0] != '#':
                    rescanDatasetName.append(line)
        elif flag == '--validate':
            schema = arg
            restApi = True
        elif flag == '--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()
    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since they are already
        # in the database and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()
    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs) == 0:
                print "No directories specified."
                return

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props,
                                                            datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props,
                                                                     datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map: dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                      filefilt=filefilt, datasetName=datasetName,
                                                                      offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")

    # Iterate over datasets
    if not publishOnly:
        # pdb.set_trace()
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session,
                                       aggregateDimension, publishOp, filefilt, initcontext, offline, properties,
                                       keepVersion=keepVersion, newVersion=version, extraFields=extraFields,
                                       masterGateway=masterGateway, comment=message, readFiles=readFiles,
                                       nodbwrite=nodbwrite)

    if not nodbwrite:
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las,
                                    parentId=parent, service=service, perVariable=perVariable,
                                    reinitThredds=threddsReinit, restInterface=restApi, schema=schema)
        # print `result`

    if summarizeErrors:
        print 'Summary of errors:'
        for name, versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ===' % dset.name
                for line in dset.get_warnings(Session):
                    print line
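
# A hedged sketch of driving this entry point directly from Python instead of the
# installed console script. The directory and project name are placeholders; the flags
# come straight from the getopt specification above.
argv = ['--project', 'cmip5',       # placeholder project name
        '--filter', '.*\.nc$',
        '--thredds', '--publish',
        '/data/cmip5/output1']      # placeholder scan root (positional argument)
main(argv)
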