def processIterator(command, commandArgs, filefilt=None, offline=False):
    """Create an iterator from an external process.

    Returns an iterator that returns (path, size) at each iteration.

    command
      Command string to execute the process - for example, "/some/python/bin/hsils.py".
      The process must write to stdout a blank-separated "path size" on each line.

    commandArgs
      String arguments to the process.

    filefilt
      A regular expression as defined in the Python re module. Each file returned has a basename matching the filter.

    offline
      Boolean, if True don't try to stat files.
    """
    try:
        f = subprocess.Popen(command + " " + commandArgs, shell=True, stdout=subprocess.PIPE).stdout
    except:
        error("Error running command '%s %s', check configuration option 'offline_lister_executable'." % (command, commandArgs))
        raise
    for path, size in filelistIterator_1(f, filefilt, offline=offline):
        yield (path, size)
    f.close()
    return
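# Usage sketch (not part of the original module): drive processIterator with an external
# lister and collect the results. The argument string below is a hypothetical placeholder;
# in practice the executable comes from the 'offline_lister_executable' configuration option.
def _list_offline_files():
    total = 0
    for path, size in processIterator("/some/python/bin/hsils.py", "--recurse /archive/data",
                                      filefilt=r'.*\.nc$', offline=True):
        print path, size
        total += 1
    print "%d files listed" % total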
def catalog_thredds(self, parent):
    from esgcet.publish import generateThredds
    from tkMessageBox import showerror

    dialog_icon = tkFileDialog.SaveAs(master=self.parent.control_frame2,
                                      filetypes=[("THREDDS", "*.thredds", "THREDDS XML"),
                                                 ("XML", "*.xml", "THREDDS XML")],
                                      title='File Open Selection')
    dirfilename = dialog_icon.show(initialdir=os.getcwd())
    if len(dirfilename) == 0:
        return

    dir = dirfilename[:dirfilename.rfind('/')]
    filename = dirfilename[dirfilename.rfind('/') + 1:]
    name = filename[:filename.rfind('.')].strip()

    # Check for directory and filename errors
    if dirfilename in [(), '']:
        showerror("ESGPublishError", "There was an error in the selected directory and specified filename.")
        return
    if name in [(), '']:
        showerror("ESGPublishError", "There was an error in the specified filename.")
        return
    if not os.access(dir, os.W_OK):
        showerror("ESGPublishError", "You do not have write permission for the selected directory.")
        return

    # Generate a THREDDS configuration file
    threddsOutputPath = dirfilename
    threddsOutput = open(threddsOutputPath, "w")
    try:
        selected_page = self.parent.parent.main_frame.selected_top_page
        datasetName = self.parent.parent.datasetName
        generateThredds(datasetName, self.parent.parent.Session, threddsOutput,
                        self.parent.parent.handlerDictionary[datasetName])
    except Exception, inst:
        error(traceback.format_exc())
        showerror("ESGPublishError", inst)
def checksum(path, client):
    """
    Calculate a file checksum.

    Returns the String checksum.

    path
      String pathname.

    client
      String client name. The command executed is '``client path``'. The client may be an absolute path
      ("/usr/bin/md5sum") or basename ("md5sum"). For a basename, the executable must be in the user's
      search path.
    """
    if not os.path.exists(path):
        raise ESGPublishError("No such file: %s" % path)

    command = "%s %s" % (client, path)
    info("Running: %s" % command)
    try:
        f = subprocess.Popen([client, path], stdout=subprocess.PIPE).stdout
    except:
        error("Error running command '%s', check configuration option 'checksum'." % command)

    lines = f.readlines()
    csum = lines[0].split()[0]
    return csum
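# Usage sketch (assumption: an md5sum-style client is installed and on the search path).
# checksum() runs '<client> <path>' and returns the first whitespace-delimited token of
# its output, so any md5sum/sha256sum-compatible tool works.
def _verify_file(path):
    try:
        csum = checksum(path, "md5sum")
        print "%s  %s" % (csum, path)
    except ESGPublishError, e:
        print "Cannot checksum %s: %s" % (path, e)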
def calc_checksum(filepath, checksum_client):
    csum = None
    if os.path.exists(filepath):
        command = "%s %s" % (checksum_client, filepath)
        info("Running: %s" % command)
        try:
            f = os.popen(command).read()
            csum = f.split(' ')[0]
        except:
            error("Error running command '%s', check configuration option 'checksum'." % command)
    return filepath, csum
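# Minimal comparison of the two helpers above, for illustration only: checksum() raises
# ESGPublishError on a missing file, while calc_checksum() returns (filepath, None), which
# is convenient in bulk loops. The helper name below is hypothetical.
def _bulk_checksums(paths, client="md5sum"):
    results = {}
    for p in paths:
        filepath, csum = calc_checksum(p, client)
        results[filepath] = csum  # None if the file is missing or the client failed
    return results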
def loadEntryPoints(self):
    """
    Get the entry points for the entry point group associated with this registry,
    and build an entry point dictionary.
    """
    optionDict = {}
    distPlugins = {}
    # distPlugins: entry_point_distribution => distribution_dict
    #   where distribution_dict: entry_point_name => handler_class
    for ep in iter_entry_points(self.entryPointGroup):
        if distPlugins.has_key(ep.dist):
            distPlugins[ep.dist][ep.name] = ep
        else:
            distPlugins[ep.dist] = {ep.name: ep}

    for dist, v in distPlugins.items():
        if v.has_key(HANDLER_NAME_ENTRY_POINT):
            if v.has_key(HANDLER_ENTRY_POINT):
                handlerName = v[HANDLER_NAME_ENTRY_POINT].module_name
                if optionDict.has_key(handlerName):
                    handlerValue = v[HANDLER_ENTRY_POINT]
                    handlerClassName, prevDist, mustload = optionDict[handlerName]
                    if handlerValue != handlerClassName:
                        error("Conflicting handler names found:\n In distribution %s, %s => (%s);\n In distribution %s, %s => (%s)\n To remove the error uninstall one of the packages with 'easy_install -mxN package_name'." % (dist, handlerName, handlerValue, prevDist, handlerName, handlerClassName))
                else:
                    optionDict[handlerName] = (v[HANDLER_ENTRY_POINT], dist, True)
            else:
                warning("Distribution %s does not define a %s option." % (dist, HANDLER_ENTRY_POINT))
        elif v.has_key(HANDLER_DICT_ENTRY_POINT):
            handlerDict = v[HANDLER_DICT_ENTRY_POINT].load()
            for handlerName, handlerClassName in handlerDict.items():
                if optionDict.has_key(handlerName):
                    handlerValue = v[HANDLER_ENTRY_POINT]
                    handlerClassName, prevDist, mustload = optionDict[handlerName]
                    if handlerValue != handlerClassName:
                        error("Conflicting handler names found:\n In distribution %s, %s => (%s);\n In distribution %s, %s => (%s)\n To remove the error uninstall one of the packages with 'easy_install -mxN package_name'." % (dist, handlerName, handlerValue, prevDist, handlerName, handlerClassName))
                else:
                    optionDict[handlerName] = (handlerClassName, dist, False)

    return optionDict
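# Usage sketch (assumption: 'registry' is an instance of the class that owns loadEntryPoints,
# e.g. the handler registry built by the publisher). The returned dictionary maps
# handler_name => (handler_class_or_entry_point, distribution, mustload).
def _report_handlers(registry):
    for name, (value, dist, mustload) in registry.loadEntryPoints().items():
        print "%-30s defined in %s (mustload=%s)" % (name, dist, mustload)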
def pollDatasetPublicationStatus(datasetName, Session, service=None):
    """
    Get the current dataset publication status by polling the gateway.

    Returns the current dataset publication status.

    datasetName
      String dataset name.

    Session
      A database Session.

    service
      Web service proxy instance. If None, the service is created.
    """
    session = Session()
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is None:
        messaging.error("Dataset not found: %s" % datasetName)
        session.close()
        return PUBLISH_FAILED_EVENT

    status = dset.get_publication_status()
    if status != START_PUBLISH_DATASET_EVENT:
        session.close()
        return status

    if service is None:
        config = getConfig()
        serviceURL = getHessianServiceURL()
        servicePort = config.getint('DEFAULT', 'hessian_service_port')
        serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile,
                          cert_file=serviceCertfile, debug=serviceDebug)

    try:
        statusObj = PublicationStatus(dset.status_id, service)
    except socket.error, e:
        raise ESGPublishError("Socket error: %s\nIs the proxy certificate %s valid?" % (`e`, service._cert_file))
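# Polling-loop sketch (assumptions: 'Session' is the SQLAlchemy sessionmaker configured
# elsewhere in the publisher, and START_PUBLISH_DATASET_EVENT is the same constant used
# above). The helper name, interval and retry count are illustrative only.
import time

def _wait_for_publication(datasetName, Session, interval=10, retries=30):
    status = pollDatasetPublicationStatus(datasetName, Session)
    for _ in range(retries):
        if status != START_PUBLISH_DATASET_EVENT:
            break
        time.sleep(interval)
        status = pollDatasetPublicationStatus(datasetName, Session)
    return status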
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True,
                       las=False, progressCallback=None, service=None, perVariable=None, threddsCatalogDictionary=None,
                       reinitThredds=None, readFromCatalog=False, restInterface=False, schema=None,
                       pid_connector=None, project_config_section=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status

    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each dataset
      name is used as a key to lookup the respective parent id. If a string, the parent id is set to the string
      for all datasets being published. This function can be overridden in the project handler to implement a
      project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true (default False), write the associated LAS catalog.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to the value of the thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary.
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    pid_connector
      esgfpid.Connector object to register PIDs

    project_config_section
      Name of the project config section in esg.ini (for user specific project configs)
    """
    session = Session()
    resultDict = {}
    if readFromCatalog and threddsCatalogDictionary is None:
        raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName, versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s" % datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of the thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName, versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno == -1:
                versionno = latestVersion
            if versionno != latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)" % (versionno, latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable,
                                versionNumber=versionno, pid_connector=pid_connector)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName, versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s" % threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)  # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable,
                                versionNumber=versionno, pid_connector=pid_connector)
                threddsCatalogDictionary[(datasetName, versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added." % e)
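# End-to-end sketch (assumptions: the publisher is configured via esg.ini, the datasets have
# already been scanned into the node database, and 'Session' is the sessionmaker used elsewhere
# in this module). Version -1 selects the latest version of each dataset; the docstring above
# describes the returned status dictionary. The helper name is hypothetical.
def _publish_latest(dataset_ids, Session):
    names = [(dsid, -1) for dsid in dataset_ids]
    statusDict = publishDatasetList(names, Session, thredds=True, las=False, restInterface=True)
    for (dsid, version), status in statusDict.items():
        print dsid, version, status
    return statusDict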
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False,
                      progressCallback=None, deleteAll=False, republish=False, reinitThredds=True, restInterface=False,
                      pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of
      (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default False), reinitialize server.

    deleteInDatabase
      Boolean flag: if true (default False), delete the database entry.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets
      to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
      esgfpid.Connector object to register PIDs

    project_config_section
      Name of the project config section in esg.ini (for user specific project configs)

    data_node
      String, the datanode to unpublish (only for unpublication from Solr)
    """
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")
    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d" % gatewayOperation)
    deleteOnGateway = (gatewayOperation == DELETE)
    operation = (gatewayOperation != NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName, version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session,
                                                                      deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s" % datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:
        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT', 'hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            service_certs_location = getServiceCertsLoc()
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, service_certs_location,
                                             keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName, version in datasetNames:
            if version > -1:
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                if service.service_type == 'REST':
                    error('Cannot unpublish multiple versions using REST. Please specify a single dataset version ("dataset_id#1"). Skipping %s' % datasetName)
                    continue
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session,
                                                                   dset=dset, data_node=data_node)
            except RemoteCallException, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s" % (datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s" % (datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s" % stateName)
            resultDict[datasetName] = eventName
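# Unpublication sketch (assumptions: configuration and database session as above; UNPUBLISH is
# the same constant used as the default gatewayOperation). A specific version is required when
# using the RESTful interface, as enforced in deleteDatasetList above. The helper name is hypothetical.
def _retract_datasets(dataset_versions, Session):
    # dataset_versions: list of (dataset_id, version) tuples, e.g. [("project.dataset.id", 1)]
    statusDict = deleteDatasetList(dataset_versions, Session, gatewayOperation=UNPUBLISH,
                                   thredds=True, las=False, restInterface=True)
    for name, event in statusDict.items():
        print name, event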
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None,
                       stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True
      (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.
    """
    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        raise ESGPublishError("Dataset not found: %s" % datasetName)

    dsetindex = {}          # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                            #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                            # Note:
                            #   (1) If dim0 is the aggregate dimension, len0 is 0
                            #   (2) A dsetindex entry will only have multiple tuples if there are more than one
                            #       variable with the same name and different domains.
    varindex = {}           # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}    # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # list of all target variables of a dataset
    dset_target_vars = set()

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:
            if filevar.is_target_variable:
                dset_target_vars.add(filevar.short_name)

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x, y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain) > 0 and fvdomain[0][0] == aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),) + tuple(fvdomain[1:])  # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type == 'Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range

                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain == vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name == aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s" % dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type == 'X':
                        var.eastwest_range = dvar.coord_range + ':' + units
                    elif dvar.coord_type == 'Y':
                        var.northsouth_range = dvar.coord_range + ':' + units
                    elif dvar.coord_type == 'Z':
                        var.updown_range = dvar.coord_range + ':' + units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain) > 0 and aggregateDimensionName == var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables + timevars)
    for var in dset.variables + timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain) > 0 and aggregateDimensionName == vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH] + fvdomain[0][LENGTH], vardomain[0][SEQ]),) + tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables + timevars)
    for var in dset.variables + timevars:
        vardomain = var.domain
        if len(vardomain) > 0 and vardomain[0][NAME] == aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(),
                                  cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar),
                                  cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar))
                                 for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s" % (fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono <= 0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono <= 0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:" % (var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)" % filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)" % filevarRanges[overlaps[i] + 1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps) > nprint:
                        dset.warning("    ... (%d duplications total)" % len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono <= 0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s" % (var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var
    for var in timevardict.values():
        dset.variables.append(var)

    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s" % (attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
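# Aggregation sketch (assumptions: the dataset has already been scanned with extractFromDataset,
# the project handler provides the CF metadata handler as in iterateOverDatasets below, and the
# aggregation dimension is named 'time'). The helper name is hypothetical.
def _aggregate(datasetName, Session, handler):
    cfHandler = handler.getMetadataHandler(sessionMaker=Session)
    aggregateVariables(datasetName, Session, aggregateDimensionName='time', cfHandler=cfHandler)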
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt,
                        initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None,
                        perVariable=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None,
                        comment=None, forceAggregate=False, readFiles=False, nodbwrite=False, pid_connector=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified in one of
    two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``). All dataset
    information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found
      that can open a sample file from the dataset.

    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.

    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.

    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files
      whose basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in
      datafiles. Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary. If a boolean flag: if True the files are treated as offline (not local) and are
      not scanned or aggregated. The associated metadata will be a minimal set including file name and size.
      If a dictionary, maps dataset_name => offline flag.

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only
      to the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only
      to the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number explicitly. If a dictionary, maps dataset_id => version.
      By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated. Otherwise the
      TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    pid_connector
      esgfpid.Connector object to register PIDs
    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct):
        datasetName, versionno = datasetNames[iloop]

        # Must specify version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError("Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list.")

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s" % datasetName)

        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate = False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName, versionno)]) == 0:
                warning("No files specified for dataset %s, version %d." % (datasetName, versionno))
                continue
            firstFile = dmap[(datasetName, versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name
            info("Using project name = %s" % projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project" % (prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored' % name)
            else:
                context[name] = value

        # add dataset_version to context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset = None
        if testProgress1 is not None:
            testProgress1[1] = (100. / ct) * iloop
            if not offline:
                testProgress1[2] = (100. / ct) * iloop + (50. / ct)
            else:
                testProgress1[2] = (100. / ct) * iloop + (100. / ct)
        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler,
                                     aggregateDimensionName=aggregateDimension, offline=offline, operation=operation,
                                     progressCallback=testProgress1, perVariable=perVariable, keepVersion=keepVersion,
                                     newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway,
                                     comment=comment, useVersion=versionno, forceRescan=forceAggregate,
                                     nodbwrite=nodbwrite, pid_connector=pid_connector, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
            testProgress2[1] = (100. / ct) * iloop + 50. / ct
            testProgress2[2] = (100. / ct) * (iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler,
                               progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        if (not nodbwrite):
            handler.saveContext(datasetName, Session)

        datasets.append(dataset)

    return datasets
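# Scan-and-aggregate sketch (assumptions: an esg.ini configuration has been loaded as in main()
# below, the dataset map comes from readDatasetMap, and CREATE_OP is the publication constant
# named in the docstring above). Handlers are resolved inside iterateOverDatasets when
# handlerDictionary is None. The helper name is hypothetical.
def _scan_datasets(dmap, Session):
    datasetNames = dmap.keys()  # dmap keys are (dataset_name, version) tuples
    return iterateOverDatasets(None, dmap, None, datasetNames, Session, 'time', CREATE_OP,
                               r'.*\.nc$', {}, False, {})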
def dataset_page(self, dataset=None, Session=None, handler=None): if handler != None: try: self.parent.canvas.pack_forget() # Remove the white canvas except: pass self.field_list = {} validate = [] mandatory = [] options = {} values = {} #-------------------------------------------------------------------------------- # Generate a dataset page with dummy fields if no dataset was given #-------------------------------------------------------------------------------- if handler == None: return_fields = [ "Project", "Dataset name", "Model", "Experiment", "Run number", "Product", "Format" ] validate = [1, 2, 3, 4, 1, 2, 3, 4, 1] mandatory = [ True, False, True, False, True, False, True, False, True ] options = values = { "Project": None, "Dataset name": None, "Model": None, "Experiment": None, "Run number": None, "Product": None, "Format": None } else: #-------------------------------------------------------------------------------- # Retrieve the dataset fields and properties from the queryDataset command #-------------------------------------------------------------------------------- list_fields = getQueryFields(handler) properties = {'id': (1, dataset.get_id(Session))} for x in list_fields: if x != "id": properties[x] = (2, "%") values, return_fields = queryDatasets(dataset.get_project(Session), handler, Session, properties) for x in return_fields: validate.append(handler.getFieldType(x)) options[x] = handler.getFieldOptions(x) mandatory.append(handler.isMandatory(x)) #-------------------------------------------------------------------------------- # View the dataset fields in the page #-------------------------------------------------------------------------------- for i in range(len(return_fields)): value = values[0][i] try: self.field_list[return_fields[i]] = show_field( self.parent, self.dataset_frame, return_fields[i].capitalize(), options[return_fields[i]], value, mandatory[i], validate[i]) except: field = return_fields[i] opts = options[field] mand = mandatory[i] valid = validate[i] error( "Error in show_fields: field=%s, options=%s, value=%s, mandatory=%s, validate=%s" % (field, ` opts `, ` value `, mand, valid)) error(traceback.format_exc()) raise Pmw.alignlabels( self.field_list.values()) # align the labels for a clean look self.dataset_frame.pack(side='left', expand=1, fill='both', pady=2) #-------------------------------------------------------------------------------- # Create and pack the Group to display the message about mandatory fields #-------------------------------------------------------------------------------- txtFont = tkFont.Font(self.parent, family=pub_controls.text_font_type, size=pub_controls.text_font_size, weight=font_weight) g = Pmw.Group(self.dataset_sframe.interior(), tag_text='Mandatory Fields', tag_font=txtFont) g.pack(fill='x', padx=36) cw = Tkinter.Label( g.interior(), text= 'All fields that begin with an asterisk\n"*" and in blue, must have an entry.', font=txtFont) cw.pack(padx=2, pady=2, expand='yes', fill='both')
def main(argv): try: args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\ 'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\ 'service=', 'use-version-dir', 'version=']) except getopt.error: print sys.exc_value return if len(lastargs)==0: print 'No directory specified' return appendMap = None datasetName = None datasetTechNotesURL = None datasetTechNotesTitle = None filefilt = '.*\.nc$' init_file = None offline = False output = sys.stdout projectName = None properties = {} readFiles = False service = None max_threads = 4 version_dir = False use_version = None for flag, arg in args: if flag=='-a': if os.path.exists(arg): appendMap = readDatasetMap(arg) else: appendMap = {} output = open(arg, 'a') elif flag=='--dataset': datasetName = arg elif flag=='--dataset-tech-notes': datasetTechNotesURL = arg elif flag=='--dataset-tech-notes-title': datasetTechNotesTitle = arg elif flag=='--filter': filefilt = arg elif flag in ['-h', '--help']: print usage sys.exit(0) elif flag=='-i': init_file = arg elif flag=='--max-threads': max_threads = int(arg) elif flag in ['-o', '--output']: output = open(arg, 'w') elif flag=='--offline': offline = True elif flag=='--project': projectName = arg elif flag in ['-p', '--property']: name, value = arg.split('=') properties[name] = value elif flag in ['-e', '--read-directories']: readFiles = False elif flag=='--read-files': readFiles = True elif flag=='--service': service = arg elif flag=='--use-version-dir': version_dir = True elif flag=='--version': version_dir = True if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD' use_version = arg[1:] else: use_version = arg # Load the configuration and set up a database connection config = loadConfig(init_file) engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600) initLogging('extract', override_sa=engine) Session = sessionmaker(bind=engine, autoflush=True, autocommit=False) # Register project handlers registerHandlers() if not offline: # Determine if checksumming is enabled line = config.get('DEFAULT', 'checksum', default=None) if line is not None: checksumClient, checksumType = splitLine(line) else: checksumClient = None if projectName is not None: handler = getHandlerByName(projectName, None, Session) else: warning("No project name specified!") multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt) firstFile, size = multiIter.next() handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError("No project found in file %s, specify with --project."%firstFile) projectName = handler.name if not readFiles: datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir) else: datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName) # Output the map keys = datasetMap.keys() keys.sort() datasetMapVersion = {} if version_dir: # check for version directory for dataset_id in keys: ds_id_version = dataset_id.split('#') if len(ds_id_version) == 2: ds_id, ds_version = ds_id_version if not re.match('^[0-9]+$', ds_version): warning("Version must be an integer. 
Skipping version %s of dataset %s."%(ds_version, ds_id)) continue if use_version and ds_version != use_version: continue if ds_id in datasetMapVersion: datasetMapVersion[ds_id].append(ds_version) else: datasetMapVersion[ds_id] = [ds_version] else: error("No version directory found. Skipping dataset %s."%dataset_id) if datasetMapVersion: keys = datasetMapVersion.keys() keys.sort() else: if use_version: error("Version %s not found. No datasets to process."%use_version) else: error("No datasets to process.") return for dataset_id in keys: skip_dataset = False dataset_id_version = dataset_id path_version = None # if multiple versions of the same dataset available use latest version if version_dir: path_version = sorted(datasetMapVersion[dataset_id])[-1] if len(datasetMapVersion[dataset_id]) > 1: info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version)) dataset_id_version = '%s#%s'%(dataset_id, path_version) direcTuple = datasetMap[dataset_id_version] direcTuple.sort() mapfile_md = {} for nodepath, filepath in direcTuple: # If readFiles is not set, generate a map entry for each file in the directory # that matches filefilt ... if not readFiles: itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False) # ... otherwise if readFiles is set, generate a map entry for each file else: itr = fnIterator([filepath]) for filepath, sizet in itr: size, mtime = sizet mapfile_md[filepath] = [size] mapfile_md[filepath].append("mod_time=%f"%float(mtime)) extraStuff = "mod_time=%f"%float(mtime) if datasetTechNotesURL is not None: mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL) if datasetTechNotesURL is not None: mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle) if checksumClient is not None: pool = ThreadPool(processes=max_threads) args = [(filepath, checksumClient) for filepath in mapfile_md] checksum_list = pool.map(calc_checksum_wrapper, args) for entry in checksum_list: if not entry[1]: error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id)) skip_dataset = True # skip entire dataset if we have one file without checksum break mapfile_md[entry[0]].append('checksum=%s'%entry[1]) mapfile_md[entry[0]].append('checksum_type=%s'%checksumType) for fpath in mapfile_md: mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0]) for md in mapfile_md[fpath][1:]: mapfile_line+=' | %s'%md # Print the map entry if: # - Checksum exists for all files of dataset (in case checksumming is enabled) # - The map is being created, not appended, or # - The existing map does not have the dataset, or # - The existing map has the dataset, but not the file. 
if path_version: ds_id = (dataset_id, int(path_version)) else: ds_id = (dataset_id, -1) if not skip_dataset and ( (appendMap is None) or (not appendMap.has_key(ds_id)) or ((fpath, "%d" % mapfile_md[fpath][0]) not in appendMap[ds_id]) ): print >>output, mapfile_line else: # offline if projectName is not None: handler = getHandlerByName(projectName, None, Session, offline=True) else: raise ESGPublishError("Must specify --project for offline datasets.") listerSection = getOfflineLister(config, "project:%s"%projectName, service) offlineLister = config.get(listerSection, 'offline_lister_executable') commandArgs = "--config-section %s "%listerSection commandArgs += " ".join(lastargs) for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True): size, mtime = sizet extrastuff = "" if mtime is not None: extrastuff = "| mod_time=%f"%float(mtime) if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]): print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff) if output is not sys.stdout: output.close()
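The map entries emitted above are pipe-separated records of the form 'dataset_id[#version] | path | size | key=value ...'. The parser sketch below is illustrative only (the helper name and the example values are assumptions), but the field order matches what this function writes.

def parse_mapfile_line(line):
    # Split a 'dataset_id[#version] | path | size | key=value ...' record into parts.
    fields = [f.strip() for f in line.split('|')]
    dataset_id, path, size = fields[0], fields[1], int(fields[2])
    extras = dict(f.split('=', 1) for f in fields[3:])   # mod_time, checksum, checksum_type, ...
    return dataset_id, path, size, extras

# Example with made-up values:
# parse_mapfile_line('project.group.model.experiment#20120101 | /data/tas.nc | 12345 | mod_time=1339990000.000000 | checksum=abc123 | checksum_type=MD5')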
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of (dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize the THREDDS server. las Boolean flag: if true (default is False), reinitialize the LAS server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. restInterface Boolean flag. If True, perform the delete/retract with the RESTful publication services. """ if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation) deleteOnGateway = (gatewayOperation==DELETE) operation = (gatewayOperation!=NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s"%datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway.
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: serviceURL = getRestServiceURL() serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = nameDict[datasetName] if (not DELETE_AT_DATASET_LEVEL) and (dset is not None): for versionObj in versionObjs: try: eventName, stateName = deleteGatewayDatasetVersion(versionObj.name, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n'))) continue info(" Result: %s"%stateName) resultDict[datasetName] = eventName else: # Nothing in the node database, but still try to delete on the gateway if DELETE_AT_DATASET_LEVEL and (dset is not None) and (not restInterface): datasetName = dset.name try: eventName, stateName = deleteGatewayDatasetVersion(datasetName, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n'))) continue
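A hedged usage sketch for deleteDatasetList; the import path, the existing Session factory, and the dataset name are assumptions rather than code from the source. UNPUBLISH (the default) retracts the datasets while DELETE removes all gateway metadata, as described in the docstring.

from esgcet.publish import deleteDatasetList, UNPUBLISH          # assumed module path

datasetNames = [('project.group.model.experiment', 1)]           # (dataset_name, version); hypothetical
statusDict = deleteDatasetList(datasetNames, Session,
                               gatewayOperation=UNPUBLISH,
                               thredds=True, las=False,
                               deleteInDatabase=False,
                               restInterface=True)
for name, event in statusDict.items():
    print name, event                                            # datasetName => resulting event/status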
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True, las=False, progressCallback=None, service=None, perVariable=None, threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False, restInterface=False, schema=None): """ Publish a list of datasets: - For each dataset, write a THREDDS catalog. - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server. - Reinitialize the LAS server. - Publish each dataset to the gateway. Returns a dictionary: (datasetName, version) => status datasetNames A list of (string_dataset_name, version) tuples. Session A database Session. parentId The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default), the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each dataset name is used as a key to look up the respective parent id. If a string, the parent id is set to the string for all datasets being published. This function can be overridden in the project handler to implement a project-specific dataset hierarchy. handlerDictionary A dictionary mapping dataset_name => handler. publish Boolean flag: if true (the default), contact the gateway to publish this dataset. thredds Boolean flag: if true (the default), write the associated THREDDS catalog. las Boolean flag: if true (default is False), reinitialize the LAS server. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. service String service name. If omitted, the first online/offline service in the configuration is used. perVariable Boolean, overrides the ``variable_per_file`` config option. threddsCatalogDictionary If not None, generate the catalogs as strings rather than writing them to the THREDDS directories, and set threddsCatalogDictionary[(datasetName, version)] = string_catalog reinitThredds Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server. If None, defaults to the value of the thredds option. readFromCatalog Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary. threddsCatalogDictionary must also be set. restInterface Boolean flag. If True, publish datasets with the RESTful publication services. schema (Optional) String name of the schema to validate against, for RESTful publication calls. 
""" session = Session() resultDict = {} if readFromCatalog and threddsCatalogDictionary is None: raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.") # Get handlers for each dataset if handlerDictionary is None: handlers = {} for datasetName,versionno in datasetNames: dset = session.query(Dataset).filter_by(name=datasetName).first() if dset is None: raise ESGPublishError("Dataset not found: %s"%datasetName) handler = getHandlerByName(dset.project, None, Session) handlers[datasetName] = handler else: handlers = handlerDictionary # reinitThredds defaults to the value of thredds option if reinitThredds is None: reinitThredds = thredds if thredds: for datasetName,versionno in datasetNames: dset = session.query(Dataset).filter_by(name=datasetName).first() # If the dataset version is not the latest, publish as a per-time dataset without aggregation, # since the dataset variables only relate to the latest dataset version latestVersion = dset.getVersion() if versionno==-1: versionno=latestVersion if versionno!=latestVersion: if perVariable: messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno,latestVersion)) perVariable = False handler = handlers[datasetName] # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ... if threddsCatalogDictionary is None: threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) threddsOutput = open(threddsOutputPath, "w") generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno) threddsOutput.close() try: os.chmod(threddsOutputPath, 0664) except: pass # ... else if threddsCatalogDictionary is the catalog source: elif readFromCatalog: catalogString = threddsCatalogDictionary[(datasetName,versionno)] threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) threddsOutput = open(threddsOutputPath, "w") messaging.info("Writing THREDDS catalog %s"%threddsOutputPath) threddsOutput.write(catalogString) threddsOutput.close() try: os.chmod(threddsOutputPath, 0664) except: pass # ... otherwise write the catalog in a 'string file' else: threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) # Creates catalog entry threddsOutput = cStringIO.StringIO() generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno) threddsCatalogDictionary[(datasetName,versionno)] = threddsOutput.getvalue() threddsOutput.close() if reinitThredds: updateThreddsMasterCatalog(Session) result = reinitializeThredds() if las: try: result = reinitializeLAS() except Exception, e: messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of (dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize the THREDDS server. las Boolean flag: if true (default is False), reinitialize the LAS server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. restInterface Boolean flag. If True, perform the delete/retract with the RESTful publication services. """ if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d" % gatewayOperation) deleteOnGateway = (gatewayOperation == DELETE) operation = (gatewayOperation != NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName, version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName( datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s" % datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT', 'hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: serviceURL = getRestServiceURL() serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName, version in datasetNames: isDataset, dset, versionObjs, isLatest = nameDict[datasetName] if (not DELETE_AT_DATASET_LEVEL) and (dset is not None): for versionObj in versionObjs: try: eventName, stateName = deleteGatewayDatasetVersion( versionObj.name, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[-2:], '\n'))) continue info(" Result: %s" % stateName) resultDict[datasetName] = eventName else: # Nothing in the node database, but still try to delete on the gateway if DELETE_AT_DATASET_LEVEL and (dset is not None) and ( not restInterface): datasetName = dset.name try: eventName, stateName = deleteGatewayDatasetVersion( datasetName, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[-2:], '\n'))) continue
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False): """ Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``). All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui]. Returns a list of persistent Dataset instances. projectName String name of the project associated with the datasets. If None, it is determined by the first handler found that can open a sample file from the dataset. dmap A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified. directoryMap A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``. datasetNames A list of dataset names identifying the datasets to be scanned. Session An SQLAlchemy Session. aggregateDimension Name of the dimension on which to aggregate the datasets. operation The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP filefilt String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored. initcontext Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles. Contrast with ``properties``. offlineArg Boolean flag or dictionary If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated metadata will be a minimal set including file name and size. If a dictionary, maps dataset_name => offline flag properties Dictionary of property/value pairs. The properties must be configured in the initialization file section corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``. testProgress1=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the scan phase. testProgress2=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the aggregation phase. handlerDictionary=None A dictionary mapping datasetName => handler. If None, handlers are determined by project name. keepVersion Boolean, True if the dataset version should not be incremented. newVersion Integer or dictionary. Set the new version number explicitly. If a dictionary, maps dataset_id => version. By default the version number is incremented by 1. See keepVersion. extraFields Extra dataset map fields, as from **readDatasetMap**. masterGateway The gateway that owns the master copy of the datasets. If None, the dataset is not replicated. Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated. comment=None String comment to associate with new datasets created. forceAggregate=False If True, run the aggregation step regardless. 
readFiles=False If True, interpret directoryMap as having one entry per file, instead of one per directory. """ from esgcet.publish import extractFromDataset, aggregateVariables versionIsMap = (type(newVersion) is types.DictType) if versionIsMap: saveVersionMap = newVersion prevProject = None datasets = [] ct = len(datasetNames) for iloop in range(ct): datasetName,versionno = datasetNames[iloop] # If using a version map, lookup the version for this dataset if versionIsMap: try: newVersion = saveVersionMap[datasetName] except KeyError: raise ESGPublishError("Dataset not found in version map: %s"%datasetName) context = initcontext.copy() # Get offline flag if type(offlineArg) is dict: offline = offlineArg[datasetName] else: offline = offlineArg # Don't try to aggregate offline datasets if offline: forceAggregate=False # Get a file iterator and sample file if dmap is not None: if len(dmap[(datasetName,versionno)])==0: warning("No files specified for dataset %s, version %d."%(datasetName,versionno)) continue firstFile = dmap[(datasetName,versionno)][0][0] fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg) else: direcTuples = directoryMap[datasetName] firstDirec, sampleFile = direcTuples[0] firstFile = os.path.join(firstDirec, sampleFile) if not readFiles: fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt) else: fileiter = fnIterator([sampfile for direc, sampfile in direcTuples]) # If the project is not specified, try to read it from the first file if handlerDictionary is not None and handlerDictionary.has_key(datasetName): handler = handlerDictionary[datasetName] elif projectName is not None: handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline) else: handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError("No project found in file %s, specify with --project."%firstFile) projectName = handler.name info("Using project name = %s"%projectName) if prevProject is not None and projectName!=prevProject: raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName)) prevProject = projectName # Generate the initial context from the dataset name context = handler.parseDatasetName(datasetName, context) # Load the rest of the context from the first file, if possible context = handler.getContext(**context) # Add properties from the command line fieldNames = handler.getFieldNames() for name, value in properties.items(): if name not in fieldNames: warning('Property not configured: %s, was ignored'%name) else: context[name] = value # Update the handler context and fill in default values handler.updateContext(context, True) # Ensure that fields are valid: try: handler.validateContext(context) except ESGInvalidMandatoryField: if offline: error("Dataset id has a missing or invalid mandatory field") raise # Create a CFHandler for validation of standard names, checking time axes, etc. 
cfHandler = handler.getMetadataHandler(sessionMaker=Session) dataset=None if testProgress1 is not None: testProgress1[1] = (100./ct)*iloop if not offline: testProgress1[2] = (100./ct)*iloop + (50./ct) else: testProgress1[2] = (100./ct)*iloop + (100./ct) dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, **context) # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset. runAggregate = (not offline) if hasattr(dataset, 'reaggregate'): runAggregate = (runAggregate and dataset.reaggregate) runAggregate = runAggregate or forceAggregate if testProgress2 is not None: testProgress2[1] = (100./ct)*iloop + 50./ct testProgress2[2] = (100./ct)*(iloop + 1) if runAggregate: aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset) elif testProgress2 is not None: # Just finish the progress GUI issueCallback(testProgress2, 1, 1, 0.0, 1.0) # Save the context with the dataset, so that it can be searched later handler.saveContext(datasetName, Session) datasets.append(dataset) return datasets
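The testProgress1/testProgress2 arguments are (callback, initial, final) sequences; note that the loop above rewrites the bounds per dataset, so a mutable list is expected rather than a literal tuple. A minimal, purely illustrative console callback:

def console_progress(progress):
    # 'progress' is a value between the current initial and final bounds
    print "progress: %.1f%%" % progress

scanProgress = [console_progress, 0.0, 50.0]     # scan phase (testProgress1)
aggProgress = [console_progress, 50.0, 100.0]    # aggregation phase (testProgress2)
# e.g. pass testProgress1=scanProgress, testProgress2=aggProgress to iterateOverDatasets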
def dataset_page( self, dataset = None, Session = None, handler = None ): if handler != None: try: self.parent.canvas.pack_forget() # Remove the white canvas except: pass self.field_list = {} validate = [] mandatory = [] options = {} values = {} #-------------------------------------------------------------------------------- # Generate a dataset page with dummy fields if no dataset was given #-------------------------------------------------------------------------------- if handler == None: return_fields = ["Project", "Dataset name", "Model", "Experiment", "Run number", "Product", "Format"] validate = [1,2,3,4,1,2,3,4,1] mandatory = [True,False,True,False,True,False,True,False,True] options = values = {"Project":None, "Dataset name":None, "Model":None, "Experiment":None, "Run number":None, "Product":None, "Format":None} else: #-------------------------------------------------------------------------------- # Retrieve the dataset fields and properties from the queryDataset command #-------------------------------------------------------------------------------- list_fields = getQueryFields( handler ) properties = {'id':(1, dataset.get_id( Session ))} for x in list_fields: if x != "id": properties[ x ] = (2, "%") values, return_fields = queryDatasets(dataset.get_project( Session ), handler, Session, properties) for x in return_fields: validate.append( handler.getFieldType( x ) ) options[ x ] = handler.getFieldOptions( x ) mandatory.append( handler.isMandatory( x ) ) #-------------------------------------------------------------------------------- # View the dataset fields in the page #-------------------------------------------------------------------------------- for i in range(len(return_fields)): value = values[0][i] try: self.field_list[return_fields[i]] = show_field( self.parent, self.dataset_frame, return_fields[i].capitalize(), options[ return_fields[i] ], value, mandatory[i], validate[i] ) except: field = return_fields[i] opts = options[field] mand = mandatory[i] valid = validate[i] error("Error in show_fields: field=%s, options=%s, value=%s, mandatory=%s, validate=%s"%(field, `opts`, `value`, mand, valid)) error(traceback.format_exc()) raise Pmw.alignlabels (self.field_list.values()) # align the labels for a clean look self.dataset_frame.pack(side = 'left', expand=1, fill='both' , pady = 2) #-------------------------------------------------------------------------------- # Create and pack the Group to display the message about mandatory fields #-------------------------------------------------------------------------------- txtFont=tkFont.Font(self.parent, family = pub_controls.text_font_type, size=pub_controls.text_font_size, weight=font_weight) g = Pmw.Group(self.dataset_sframe.interior(), tag_text='Mandatory Fields', tag_font=txtFont ) g.pack(fill = 'x', padx = 36) cw = Tkinter.Label(g.interior(), text = 'All fields that begin with an asterisk\n"*" and in blue, must have an entry.', font = txtFont ) cw.pack(padx = 2, pady = 2, expand='yes', fill='both')