def readContext(self, cdfile): "Get a dictionary of key/value pairs from an open file." f = cdfile.file result = {} if hasattr(f, 'title'): result['title'] = f.title if hasattr(f, 'Conventions'): result['Conventions'] = f.Conventions if hasattr(f, 'source'): result['source'] = f.source if hasattr(f, 'history'): result['history'] = f.history config = getConfig() projectSection = 'project:' + self.name config_key = "extract_global_attrs" if config.has_option(projectSection, config_key): cdms_file = cdms_open(self.path) for key in splitLine(config.get(projectSection, config_key), ','): # check for mapped keys if ':' in key: parts = key.split(':') value = cdms_file.__getattribute__(parts[0]) result[parts[1]] = value else: result[key] = cdms_file.__getattribute__(key) return result
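# A minimal, self-contained sketch of how the extract_global_attrs option is
# interpreted above: a comma-separated list of attribute names, where an entry of
# the form "file_attr:result_key" copies the file attribute under a different
# result key. The option value and attribute names below are hypothetical examples.
def parse_extract_global_attrs(option_value, file_attrs):
    result = {}
    for key in [k.strip() for k in option_value.split(',')]:
        if ':' in key:
            src, dest = key.split(':')
            result[dest] = file_attrs[src]
        else:
            result[key] = file_attrs[key]
    return result

print(parse_extract_global_attrs("institution_id:institute, frequency",
                                 {"institution_id": "PCMDI", "frequency": "mon"}))
# {'institute': 'PCMDI', 'frequency': 'mon'}  (key order may vary)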
def __init__(self, cdf, path):
    self.attr_only = False
    if cdf is None:
        self.noncd = True
        self.file = {}
        self.path = path
    else:
        # load config and set the handler mode based on the variables_none option
        config = getConfig()
        projectSection = 'project:input4mips'
        variables_none = config.get(projectSection, "variables_none", default="false")
        if variables_none == "false":
            self.noncd = False
            CdunifFormatHandler.__init__(self, cdf, path)
        elif variables_none == "attr":
            CdunifFormatHandler.__init__(self, cdf, path)
            self.attr_only = True
            self.noncd = True
        else:
            # assume "true"
            self.noncd = True
            self.file = {}
            self.path = path
def __init__(self, cdf, path):
    self.attr_only = False
    if cdf is None:
        self.noncd = True
        self.file = {}
        self.path = path
    else:
        # load config and set the handler mode based on the variables_none option
        config = getConfig()
        projectSection = 'project:dream'
        variables_none = config.get(projectSection, "variables_none", default="false")
        if variables_none == "false":
            self.noncd = False
            CdunifFormatHandler.__init__(self, cdf, path)
        elif variables_none == "attr":
            CdunifFormatHandler.__init__(self, cdf, path)
            self.attr_only = True
            self.noncd = True
        else:
            # assume "true"
            self.noncd = True
            self.file = {}
            self.path = path
def getDatasetIdFields(self): """Get a list of (lists of) fields associated with the dataset ID. This may be passed to ``generateDatasetId``. """ config = getConfig() section = 'project:'+self.name dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True)) idfields = [re.findall(_patpat, format) for format in dataset_id_formats] return idfields, dataset_id_formats
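# Hedged illustration of the field extraction above: _patpat is assumed to be a
# pattern matching %(name)s tokens, so each dataset_id format yields the ordered
# list of fields needed to build the ID. The format string is an example only.
import re

_patpat_example = r'%\(([^()]*)\)s'  # assumed shape of _patpat
fmt = '%(project)s.%(model)s.%(experiment)s.%(time_frequency)s'
print(re.findall(_patpat_example, fmt))
# ['project', 'model', 'experiment', 'time_frequency']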
def generateDatasetId(self, option, idfields, groupdict, multiformat=None): """ Generate a dataset ID from a config file option. Returns the ID. option Name of the dataset ID option idfields List of string fields needed to generate the ID, or a list of lists if multiformat is not None. groupdict Dictionary to generate the ID from. multiformat Set for multi-field formats, such as dataset_id. """ config = getConfig() section = 'project:' + self.name mapdict = self.getMaps() keys = groupdict.keys() foundValue = False if multiformat is not None: for fieldlist, format in zip(idfields, multiformat): try: result = self.generateDatasetId_1(option, fieldlist, groupdict, config, section, mapdict, keys, format=format) except: continue else: foundValue = True break else: try: result = self.generateDatasetId_1(option, idfields, groupdict, config, section, mapdict, keys) except: pass else: foundValue = True if not foundValue: raise ESGPublishError( "Cannot generate a value for option %s, please specify the dataset id explicitly." % option) return result
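# Sketch of what a successful generateDatasetId_1 call amounts to when every
# field is already present in groupdict: the format is filled by %-interpolation.
# The format string, field names and values here are illustrative, not taken
# from a real config file.
fmt = '%(project)s.%(model)s.%(experiment)s'
groupdict = {'project': 'test', 'model': 'modelA', 'experiment': 'historical'}
print(fmt % groupdict)  # test.modelA.historical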
def initializeFields(self, Session): """Initialize field names and options based on the configuration file.""" from esgcet.model import Model, Experiment config = getConfig() projectSection = "project:" + self.name categoryOption = config.get(projectSection, "categories") categorySpecs = splitRecord(categoryOption) for category, categoryTypeS, isMandatoryS, isThreddsPropertyS, displayOrderS in categorySpecs: categoryType = getCategoryType(categoryTypeS) isMandatory = getBoolean(isMandatoryS) isThreddsProperty = getBoolean(isThreddsPropertyS) displayOrder = string.atoi(displayOrderS) self.fieldNames[category] = (categoryType, isMandatory, isThreddsProperty, displayOrder) categoryDefaultsOption = config.get(projectSection, "category_defaults", default=None, raw=True) if categoryDefaultsOption is not None: categoryDefaultsSpecs = splitRecord(categoryDefaultsOption) for category, categoryDefault in categoryDefaultsSpecs: self.categoryDefaults[category] = categoryDefault session = Session() # Find any new experiments. This allows experiments to be added to the config file without # running esginitialize. if self.fieldNames.has_key("experiment") and self.fieldNames["experiment"][WIDGET_TYPE] == ENUM: initializeExperiments(config, self.name, session) for category in self.getFieldNames(): # At the moment some fields are predefined if category == "project": projects = splitRecord(config.get(projectSection, "project_options", default="")) self.validValues["project"] = [x[0] for x in projects] elif category == "model": models = session.query(Model).filter_by(project=self.name).all() self.validValues["model"] = [x.name for x in models] elif category == "experiment": experiments = session.query(Experiment).filter_by(project=self.name).all() self.validValues["experiment"] = [x.name for x in experiments] elif category == "creator": creators = splitRecord(config.get(projectSection, "creator_options", default="")) self.validValues["creator"] = [x[0] for x in creators] self.validMaps["creator"] = genMap(creators) elif category == "publisher": publishers = splitRecord(config.get(projectSection, "publisher_options", default="")) self.validValues["publisher"] = [x[0] for x in publishers] self.validMaps["publisher"] = genMap(publishers) else: categoryType = self.getFieldType(category) if categoryType == ENUM: option = category + "_options" self.validValues[category] = splitLine(config.get(projectSection, option), ",") self.context[category] = "" session.close()
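# Hedged sketch of how one "categories" record maps onto the fieldNames tuple
# built above (category type, mandatory flag, THREDDS property flag, display
# order). The record text and the pipe-separated layout are assumptions made
# for illustration, not the exact splitRecord format.
record = "experiment | enum | true | true | 3"
category, ctype, mandatory, thredds, order = [f.strip() for f in record.split('|')]
fieldNames = {category: (ctype, mandatory == 'true', thredds == 'true', int(order))}
print(fieldNames)  # {'experiment': ('enum', True, True, 3)}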
def _colors_are_disabled(self):
    if self._disable_colors is None:
        config = getConfig()
        if config:
            self._disable_colors = \
                config.getboolean('DEFAULT', 'disable_colors', default=False)
        else:
            return False  # allow colors until config is loaded
    return self._disable_colors
def getHessianServiceURL(project_config_section=None): """Get the configured value of hessian_service_url""" config = getConfig() serviceURL = None if project_config_section and config.has_section(project_config_section): serviceURL = config.get(project_config_section, 'hessian_service_url', default=None) if not serviceURL: serviceURL = config.get('DEFAULT', 'hessian_service_url') return serviceURL
def getDatasetIdFields(self): """Get a list of (lists of) fields associated with the dataset ID. This may be passed to ``generateDatasetId``. """ config = getConfig() section = 'project:' + self.name dataset_id_formats = splitLine( config.get(section, 'dataset_id', raw=True)) idfields = [ re.findall(_patpat, format) for format in dataset_id_formats ] return idfields, dataset_id_formats
def getRestServiceURL(): """Get the configured value of rest_service_url. If not set, derive host from hessian_service_url and use '/esg-search/ws' as the path. """ config = getConfig() serviceURL = config.get('DEFAULT', 'rest_service_url', default=None) if serviceURL is None: hessianServiceURL = config.get('DEFAULT', 'hessian_service_url') host = urlparse.urlparse(hessianServiceURL).netloc serviceURL = urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', '')) return serviceURL
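# Self-contained version of the fallback above: take the host from
# hessian_service_url and attach the '/esg-search/ws' path. The example URL is
# hypothetical; urlparse is the Python 2 module name used throughout this code.
import urlparse

hessianServiceURL = 'https://esgf-node.example.org/esg-search/remote/secure/client-cert/hessian/publishingService'
host = urlparse.urlparse(hessianServiceURL).netloc
print(urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', '')))
# https://esgf-node.example.org/esg-search/ws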
def validateFile(self, fileobj): """Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler.""" config = getConfig() projectSection = 'project:'+self.name if config.has_option(projectSection, 'min_cmor_version'): min_cmor_version = config.get(projectSection, "min_cmor_version", default="0.0.0") file_cmor_version = fileobj.getAttribute('cmor_version', None) if not compareLibVersions(min_cmor_version, file_cmor_version): raise ESGInvalidMetadataFormat("file " + self.path + " cmor version = " + file_cmor_version + ", running checks - minimum = " + min_cmor_version )
def getRemoteMetadataService(serviceUrl=None): """Get the remote metadata service. Returns the service object. """ config = getConfig() if serviceUrl is None: remoteMetadataServiceUrl = config.get('DEFAULT', 'hessian_service_remote_metadata_url') else: remoteMetadataServiceUrl = serviceUrl serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(remoteMetadataServiceUrl, 80, debug=serviceDebug) return service
def generateDatasetId(self, option, idfields, groupdict, multiformat=None): """ Generate a dataset ID from a config file option. Returns the ID. option Name of the dataset ID option idfields List of string fields needed to generate the ID, or a list of lists if multiformat is not None. groupdict Dictionary to generate the ID from. multiformat Set for multi-field formats, such as dataset_id. """ config = getConfig() section = "project:" + self.name mapdict = self.getMaps() keys = groupdict.keys() foundValue = False if multiformat is not None: for fieldlist, format in zip(idfields, multiformat): try: result = self.generateDatasetId_1( option, fieldlist, groupdict, config, section, mapdict, keys, format=format ) except: continue else: foundValue = True break else: try: result = self.generateDatasetId_1(option, idfields, groupdict, config, section, mapdict, keys) except: pass else: foundValue = True if not foundValue: raise ESGPublishError( "Cannot generate a value for option %s, please specify the dataset id explicitly." % option ) return result
def getHessianServiceURL(): """Get the configured value of hessian_service_url""" config = getConfig() serviceURL = config.get('DEFAULT', 'hessian_service_url') gatewayServiceRoot = os.environ.get('ESG_GATEWAY_SVC_ROOT', None) if gatewayServiceRoot is not None: dum, serviceHost, dum, dum, dum, dum = urlparse.urlparse(serviceURL) dum, envServiceHost, dum, dum, dum, dum = urlparse.urlparse('http://'+gatewayServiceRoot) if serviceHost!=envServiceHost: warning("hessian_service_url=%s but environment variable ESG_GATEWAY_SVC_ROOT=%s, please reconcile these values"%(serviceURL, gatewayServiceRoot)) return serviceURL
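# Sketch of the consistency check above: compare the host in the configured
# hessian_service_url with the host implied by ESG_GATEWAY_SVC_ROOT. The URL and
# root values are examples; urlparse.urlparse returns a 6-tuple whose second
# element is the network location (host).
import urlparse

serviceURL = 'http://gateway.example.org/remote/hessian/publishingService'
gatewayServiceRoot = 'other-gateway.example.org/esgcet'
serviceHost = urlparse.urlparse(serviceURL)[1]
envServiceHost = urlparse.urlparse('http://' + gatewayServiceRoot)[1]
print(serviceHost != envServiceHost)  # True, so a warning would be issued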
def getRemoteMetadataService(serviceUrl=None): """Get the remote metadata service. Returns the service object. """ config = getConfig() if serviceUrl is None: remoteMetadataServiceUrl = config.get( 'DEFAULT', 'hessian_service_remote_metadata_url') else: remoteMetadataServiceUrl = serviceUrl serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(remoteMetadataServiceUrl, 80, debug=serviceDebug) return service
def pollDatasetPublicationStatus(datasetName, Session, service=None): """ Get the current dataset publication status by polling the gateway. Returns the current dataset publication status. datasetNames A list of string dataset names. Session A database Session. service Web service proxy instance. If None, the service is created. """ session = Session() dset = session.query(Dataset).filter_by(name=datasetName).first() if dset is None: messaging.error("Dataset not found: %s" % datasetName) session.close() return PUBLISH_FAILED_EVENT status = dset.get_publication_status() if status != START_PUBLISH_DATASET_EVENT: session.close() return status if service is None: config = getConfig() serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT', 'hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) try: statusObj = PublicationStatus(dset.status_id, service) except socket.error, e: raise ESGPublishError( "Socket error: %s\nIs the proxy certificate %s valid?" % ( ` e `, service._cert_file))
def getRestServiceURL(project_config_section=None):
    """Get the configured value of rest_service_url. If not set, derive
    host from hessian_service_url and use '/esg-search/ws' as the path.
    """
    config = getConfig()
    hessianServiceURL = None
    serviceURL = None

    # get the project-specific hessian service url
    if serviceURL is None:
        if project_config_section and config.has_section(project_config_section):
            hessianServiceURL = config.get(project_config_section, 'hessian_service_url', default=None)
        if not hessianServiceURL:
            hessianServiceURL = config.get('DEFAULT', 'hessian_service_url')

        host = urlparse.urlparse(hessianServiceURL).netloc
        serviceURL = urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', ''))

    return serviceURL
def validateFile(self, fileobj): """Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler.""" config = getConfig() projectSection = 'project:' + self.name if config.has_option(projectSection, 'min_cmor_version'): min_cmor_version = config.get(projectSection, "min_cmor_version", default="0.0.0") file_cmor_version = fileobj.getAttribute('cmor_version', None) if not compareLibVersions(min_cmor_version, file_cmor_version): raise ESGInvalidMetadataFormat( "file " + self.path + " cmor version = " + file_cmor_version + ", running checks - minimum = " + min_cmor_version)
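# The exact behaviour of compareLibVersions is defined elsewhere; as a hedged
# sketch, the check above amounts to a dotted-version comparison along these
# lines (the helper name and semantics here are assumptions, not the library's API).
def version_at_least(minimum, actual):
    if actual is None:
        return False
    to_tuple = lambda v: tuple(int(p) for p in v.split('.'))
    return to_tuple(actual) >= to_tuple(minimum)

print(version_at_least("2.0.0", "3.2.4"))  # True
print(version_at_least("2.0.0", "1.9"))    # False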
def getRestServiceURL(project_config_section=None): """Get the configured value of rest_service_url. If not set, derive host from hessian_service_url and use '/esg-search/ws' as the path. """ config = getConfig() hessianServiceURL = None serviceURL = None # get project specific hessian service url if serviceURL is None: if project_config_section and config.has_section(project_config_section): hessianServiceURL = config.get(project_config_section, 'hessian_service_url', default=None) if not hessianServiceURL: hessianServiceURL = config.get('DEFAULT', 'hessian_service_url') host = urlparse.urlparse(hessianServiceURL).netloc serviceURL = urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', '')) return serviceURL
def reinitializeLAS(): """ Reinitialize the Live Access Server. This forces the catalogs to be reread. Returns the HTML string returned from the URL. """ config = getConfig() if config is None: raise ESGPublishError("No configuration file found.") lasReinitUrl = config.get("DEFAULT", "las_reinit_url") info("Reinitializing LAS server") try: reinitResult = readThreddsWithAuthentication(lasReinitUrl, config) except Exception, e: raise ESGPublishError("Error reinitializing the Live Access Server: %s" % e)
def getDirectoryFormatFilters(self): """Return a list of regular expression filters associated with the ``directory_format`` option in the configuration file. This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``. """ config = getConfig() section = "project:" + self.name directory_format = config.get(section, "directory_format", raw=True) formats = splitLine(directory_format) filters = [] for format in formats: pat = format.strip() pat2 = pat.replace("\.", "__ESCAPE_DOT__") pat3 = pat2.replace(".", r"\.") pat4 = pat3.replace("__ESCAPE_DOT__", r"\.") # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4) pattern = re.sub(_patpat, r"(?P<\1>[^/]*)", pat4) filter = "^" + pattern + "$" filters.append(filter) return filters
def reinitializeLAS(): """ Reinitialize the Live Access Server. This forces the catalogs to be reread. Returns the HTML string returned from the URL. """ config = getConfig() if config is None: raise ESGPublishError("No configuration file found.") lasReinitUrl = config.get('DEFAULT', 'las_reinit_url') info("Reinitializing LAS server") try: reinitResult = readThreddsWithAuthentication(lasReinitUrl, config) except Exception, e: raise ESGPublishError( "Error reinitializing the Live Access Server: %s" % e)
def __init__(self, cdf, path): if (cdf is None ): self.noncd = True self.file = {} self.path = path else: # load config and set it based on config = getConfig() projectSection = 'project:' variables_none = config.get(projectSection, "variables_none", default="false") if variables_none == "false": self.noncd = False CdunifFormatHandler.__init__(self, cdf, path) else: self.noncd = True self.file = {} self.path = path
def getFilters(self, option='directory_format'): """Return a list of regular expression filters associated with the option in the configuration file. This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``. """ config = getConfig() section = 'project:' + self.name directory_format = config.get(section, option, raw=True) formats = splitLine(directory_format) filters = [] for format in formats: pat = format.strip() pat2 = pat.replace('\.', '__ESCAPE_DOT__') pat3 = pat2.replace('.', r'\.') pat4 = pat3.replace('__ESCAPE_DOT__', r'\.') # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4) pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat4) filter = '^' + pattern + '$' filters.append(filter) return filters
def getFilters(self, option='directory_format'): """Return a list of regular expression filters associated with the option in the configuration file. This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``. """ config = getConfig() section = 'project:'+self.name directory_format = config.get(section, option, raw=True) formats = splitLine(directory_format) filters = [] for format in formats: pat = format.strip() pat2 = pat.replace('\.','__ESCAPE_DOT__') pat3 = pat2.replace('.', r'\.') pat4 = pat3.replace('__ESCAPE_DOT__', r'\.') # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4) pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat4) filter = '^'+pattern+'$' filters.append(filter) return filters
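# Runnable illustration of the escape-and-substitute steps in the two getFilters
# variants above: a directory_format template with %(name)s fields becomes a
# regex with named groups that each match one path component. The format string
# and path are example values; _patpat is assumed to match %(name)s tokens.
import re

_patpat_example = r'%\(([^()]*)\)s'
fmt = '/data/%(project)s/%(model)s/%(experiment)s'
pat = fmt.strip().replace('\\.', '__ESCAPE_DOT__').replace('.', r'\.').replace('__ESCAPE_DOT__', r'\.')
pattern = re.sub(_patpat_example, r'(?P<\1>[^/]*)', pat)
regex = '^' + pattern + '$'
m = re.match(regex, '/data/test/modelA/historical')
print(m.groupdict())
# {'project': 'test', 'model': 'modelA', 'experiment': 'historical'}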
def pollDatasetPublicationStatus(datasetName, Session, service=None): """ Get the current dataset publication status by polling the gateway. Returns the current dataset publication status. datasetNames A list of string dataset names. Session A database Session. service Web service proxy instance. If None, the service is created. """ session = Session() dset = session.query(Dataset).filter_by(name=datasetName).first() if dset is None: messaging.error("Dataset not found: %s"%datasetName) session.close() return PUBLISH_FAILED_EVENT status = dset.get_publication_status() if status!=START_PUBLISH_DATASET_EVENT: session.close() return status if service is None: config = getConfig() serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) try: statusObj = PublicationStatus(dset.status_id, service) except socket.error, e: raise ESGPublishError("Socket error: %s\nIs the proxy certificate %s valid?"%(`e`, service._cert_file))
def getServiceCertsLoc(): try: service_certs_location = getConfig().get( 'DEFAULT', 'hessian_service_certs_location') except: home = os.environ.get("HOME") if home is not None: service_certs_location = home + DEFAULT_CERTS_LOCATION_SUFFIX if service_certs_location is None: raise ESGPublishError( "hessian_service_certs_location needs to be set in esg.ini") if not os.path.exists(service_certs_location): raise ESGPublishError( "Error: " + service_certs_location + " does not exist. Please run myproxy-logon with -b to bootstrap the certificates, or set an alternate location using the hessian_service_certs_location setting in esg.ini" ) return service_certs_location
def getMaps(self): """Get a dictionary of maps from the project section. """ config = getConfig() section = 'project:'+self.name if self.mapdict is None: mapdict = {} projectMaps = splitLine(config.get(section, 'maps', default=""), ',') for option in projectMaps: if option=="": continue fromcat, tocat, projectMap = splitMap(config.get(section, option)) for to_index, field in enumerate(tocat): value = (fromcat, projectMap, to_index) if mapdict.has_key(field): mapdict[field].append(value) else: mapdict[field] = [value] self.mapdict = mapdict return self.mapdict
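# Hedged illustration of the mapdict structure built above: splitMap is assumed
# to yield the source categories, destination categories, and a dictionary from
# source-value tuples to destination-value tuples; each destination field then
# records (from_keys, map, index_into_to_values). The values below are invented.
fromcat = ('model',)
tocat = ('institute', 'model_description')
projectMap = {('modelA',): ('InstituteA', 'Example model A')}

mapdict = {}
for to_index, field in enumerate(tocat):
    mapdict.setdefault(field, []).append((fromcat, projectMap, to_index))

from_keys, value_dict, idx = mapdict['institute'][0]
print(value_dict[('modelA',)][idx])  # InstituteA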
def generateNameFromContext(self, parameter, **extraParams): """ Generate a name from a config file parameter, relative to the current handler context. Mapped format strings are also resolved. Returns a string name. parameter The configuration file option, e.g., 'dataset_id' extraParams Extra options, added to the current context before resolving the name. On return self.context is not modified. """ tempcontext = {} tempcontext.update(self.context) tempcontext.update(extraParams) section = 'project:'+self.name config = getConfig() generatedName = self.generateNameFromContext_1(parameter, config, section, 1, **tempcontext) return generatedName
def initializeFields(self, Session): BasicHandler.initializeFields(self, Session) config = getConfig() projectSection = 'project:'+self.name # Enumerated value validation is case-insensitive lowerCaseValidValues = {} for field, valueList in self.validValues.items(): lowerCaseValidList = [] validDict = {} for value in valueList: if value is not None: lvalue = value.lower() else: lvalue = None lowerCaseValidList.append(lvalue) validDict[lvalue] = value lowerCaseValidValues[field] = lowerCaseValidList self.caseSensitiveValidValues[field] = validDict self.validValues = lowerCaseValidValues self.checkFilenames = config.getboolean(projectSection, 'thredds_check_file_names', default=True)
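# Minimal standalone version of the case-insensitive normalisation above: valid
# values are stored lower-cased, with a per-field dictionary mapping back to the
# original spelling. The value list is an example.
valueList = ['Historical', 'piControl', None]
lowerCaseValidList = []
validDict = {}
for value in valueList:
    lvalue = value.lower() if value is not None else None
    lowerCaseValidList.append(lvalue)
    validDict[lvalue] = value
print(lowerCaseValidList)       # ['historical', 'picontrol', None]
print(validDict['picontrol'])   # piControl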
def getMaps(self): """Get a dictionary of maps from the project section. """ config = getConfig() section = 'project:' + self.name if self.mapdict is None: mapdict = {} projectMaps = splitLine(config.get(section, 'maps', default=""), ',') for option in projectMaps: if option == "": continue fromcat, tocat, projectMap = splitMap( config.get(section, option)) for to_index, field in enumerate(tocat): value = (fromcat, projectMap, to_index) if mapdict.has_key(field): mapdict[field].append(value) else: mapdict[field] = [value] self.mapdict = mapdict return self.mapdict
def generateNameFromContext(self, parameter, **extraParams): """ Generate a name from a config file parameter, relative to the current handler context. Mapped format strings are also resolved. Returns a string name. parameter The configuration file option, e.g., 'dataset_id' extraParams Extra options, added to the current context before resolving the name. On return self.context is not modified. """ tempcontext = {} tempcontext.update(self.context) tempcontext.update(extraParams) section = 'project:' + self.name config = getConfig() generatedName = self.generateNameFromContext_1(parameter, config, section, 1, **tempcontext) return generatedName
def readContext(self, cdfile): "Get a dictionary of key/value pairs from an open file." f = cdfile.file result = {} if hasattr(f, 'title'): result['title'] = f.title if hasattr(f, 'Conventions'): result['Conventions'] = f.Conventions if hasattr(f, 'source'): result['source'] = f.source if hasattr(f, 'history'): result['history'] = f.history config = getConfig() projectSection = 'project:' + self.name config_key = "extract_global_attrs" if config.has_option(projectSection, config_key): for key in splitLine(config.get(projectSection, config_key), ','): result[key] = cdfile.getAttribute(key, None) return result
def main(): """Uses the esg.ini file options: - thredds_file_services to get a Globus endpoint UUID - thredds_root to find a directory with THREDDS xml catalogs """ loadConfig(None) config = getConfig() if config is None: raise ESGPublishError('No configuration file found') # By default thredds_root is: /esg/content/thredds/esgcet thredds_root = config.get('DEFAULT', 'thredds_root') thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT', 'thredds_file_services') # parameters needed to re-harvest the THREDDS catalogs thredds_url = config.get('DEFAULT', 'thredds_url') hessian_service_certfile = config.get('DEFAULT', 'hessian_service_certfile') hessian_service_url = config.get('DEFAULT', 'hessian_service_url') esgf_harvesting_service_url = hessian_service_url.replace('remote/secure/client-cert/hessian/publishingService','ws/harvest') thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..')) globus_base = None for service in thredds_file_services: if service[2] == 'Globus': globus_base = service[1] if globus_base is None: print 'No Globus file service specified in %s\n'\ 'Add Globus file service to the thredds_file_services variable in the form:\n'\ ' Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\ 'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI'] sys.exit(1) print '\n'\ 'ESGINI: %s\n'\ 'THREDDS root: %s\n'\ 'THREDDS url: %s\n'\ 'Globus service base: %s\n'\ 'ESGF harvesting service url: %s\n'\ 'X.509 user credential: %s\n'\ '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base, esgf_harvesting_service_url, hessian_service_certfile) if not globus_base.endswith('/'): print 'Globus service base must end with "/". Set Globus service base correctly in\n'\ '%s end run the script again.' % os.environ['ESGINI'] sys.exit(1) print 'The script recursively goes through xml files in %s\n'\ 'looking for datasets that were published without Globus file service and adds\n'\ 'Globus access to the datasets. If a dataset was published with Globus file\n'\ 'service configured, the script skips such a dataset leaving a corresponding xml\n'\ 'file unmodified. The script reinitializes THREDDS and requests Hessian service to\n'\ 'to harvest the updated xml files. Because Hessian service requires SSL\n'\ 'authentication, the X.509 certificate, %s,\n'\ 'should be valid and obtained by a user who has the publisher role in all\n'\ 'projects.\n'\ 'It is strongly advised that you make a copy of the entire %s\n'\ 'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up) while True: sys.stdout.write("Do you want to continue? [y/N]") line = sys.stdin.readline().rstrip() if line == '' or line == 'n' or line == 'N': sys.exit(0) if line == 'y' or line == 'Y': break process(thredds_root, thredds_root_up, globus_base, thredds_url, esgf_harvesting_service_url, hessian_service_certfile)
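# Small runnable example of how the harvesting-service URL is derived above:
# the Hessian publishing path is swapped for the 'ws/harvest' REST path.
# The input URL is an example value only.
hessian_service_url = 'https://host.example.org/esg-search/remote/secure/client-cert/hessian/publishingService'
print(hessian_service_url.replace('remote/secure/client-cert/hessian/publishingService', 'ws/harvest'))
# https://host.example.org/esg-search/ws/harvest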
def main(): """Uses the esg.ini file options: - thredds_file_services to get a Globus endpoint UUID - thredds_root to find a directory with THREDDS xml catalogs """ loadConfig(None) config = getConfig() if config is None: raise ESGPublishError('No configuration file found') # By default thredds_root is: /esg/content/thredds/esgcet thredds_root = config.get('DEFAULT', 'thredds_root') thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT', 'thredds_file_services') # parameters needed to re-harvest the THREDDS catalogs thredds_url = config.get('DEFAULT', 'thredds_url') hessian_service_certfile = config.get('DEFAULT', 'hessian_service_certfile') hessian_service_url = config.get('DEFAULT', 'hessian_service_url') esgf_harvesting_service_url = hessian_service_url.replace( 'remote/secure/client-cert/hessian/publishingService', 'ws/harvest') thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..')) globus_base = None for service in thredds_file_services: if service[2] == 'Globus': globus_base = service[1] if globus_base is None: print 'No Globus file service specified in %s\n'\ 'Add Globus file service to the thredds_file_services variable in the form:\n'\ ' Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\ 'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI'] sys.exit(1) print '\n'\ 'ESGINI: %s\n'\ 'THREDDS root: %s\n'\ 'THREDDS url: %s\n'\ 'Globus service base: %s\n'\ 'ESGF harvesting service url: %s\n'\ 'X.509 user credential: %s\n'\ '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base, esgf_harvesting_service_url, hessian_service_certfile) if not globus_base.endswith('/'): print 'Globus service base must end with "/". Set Globus service base correctly in\n'\ '%s end run the script again.' % os.environ['ESGINI'] sys.exit(1) print 'The script recursively goes through xml files in %s\n'\ 'looking for datasets that were published without Globus file service and adds\n'\ 'Globus access to the datasets. If a dataset was published with Globus file\n'\ 'service configured, the script skips such a dataset leaving a corresponding xml\n'\ 'file unmodified. The script reinitializes THREDDS and requests Hessian service to\n'\ 'to harvest the updated xml files. Because Hessian service requires SSL\n'\ 'authentication, the X.509 certificate, %s,\n'\ 'should be valid and obtained by a user who has the publisher role in all\n'\ 'projects.\n'\ 'It is strongly advised that you make a copy of the entire %s\n'\ 'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up) while True: sys.stdout.write("Do you want to continue? [y/N]") line = sys.stdin.readline().rstrip() if line == '' or line == 'n' or line == 'N': sys.exit(0) if line == 'y' or line == 'Y': break process(thredds_root, thredds_root_up, globus_base, thredds_url, esgf_harvesting_service_url, hessian_service_certfile)
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of )dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server. las Boolean flag: if true (the default), reinitialize server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. reinitThredds Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server. restInterface Boolean flag. If True, publish datasets with the RESTful publication services. pid_connector esgfpid.Connector object to register PIDs project_config_section Name of the project config section in esg.ini (for user specific project configs) data_node String, the datanode to unpublish (only for unpublication from Solr) """ if gatewayOperation == UNINITIALIZED: raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!") if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation) deleteOnGateway = (gatewayOperation==DELETE) operation = (gatewayOperation!=NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s"%datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL(project_config_section=project_config_section) servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: serviceURL = getRestServiceURL(project_config_section=project_config_section) serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName,version in datasetNames: if version > -1: datasetToUnpublish = '%s.v%s' % (datasetName, version) else: datasetToUnpublish = datasetName isDataset, dset, versionObjs, isLatest = nameDict[datasetName] try: eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n'))) continue info(" Result: %s"%stateName) resultDict[datasetName] = eventName
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, perVariable=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False, nodbwrite=False, pid_connector=None, test_publication=False, handlerExtraArgs={}, commitEvery=None): """ Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``). All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui]. Returns a list of persistent Dataset instances. projectName String name of the project associated with the datasets. If None, it is determined by the first handler found that can open a sample file from the dataset. dmap A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified. directoryMap A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``. datasetNames A list of dataset names identifying the datasets to be scanned. Session An SQLAlchemy Session. aggregateDimension Name of the dimension on which to aggregate the datasets. operation The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP filefilt String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored. initcontext Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles. Contrast with ``properties``. offlineArg Boolean flag or dictionary If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated metadata will be a minimal set including file name and size. If a dictionary, maps dataset_name => offline flag properties Dictionary of property/value pairs. The properties must be configured in the initialization file section corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``. testProgress1=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the scan phase. testProgress2=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the aggregation phase. handlerDictionary=None A dictionary mapping datasetName => handler. If None, handlers are determined by project name. handlerExtraArgs={} A dictionary of extra keyword arguments to pass when instantiating the handler. perVariable=None Boolean, overrides ``variable_per_file`` config option. keepVersion Boolean, True if the dataset version should not be incremented. newVersion Integer or dictionary. Set the new version number explicitly. If a dictionary, maps dataset_id => version. By default the version number is incremented by 1. See keepVersion. extraFields Extra dataset map fields, as from **readDatasetMap**. masterGateway The gateway that owns the master copy of the datasets. 
If None, the dataset is not replicated. Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated. comment=None String comment to associate with new datasets created. forceAggregate=False If True, run the aggregation step regardless. readFiles=False If True, interpret directoryMap as having one entry per file, instead of one per directory. pid_connector esgfpid.Connector object to register PIDs commitEvery Integer specifying how frequently to commit file info to database when scanning files """ from esgcet.publish import extractFromDataset, aggregateVariables versionIsMap = (type(newVersion) is types.DictType) if versionIsMap: saveVersionMap = newVersion prevProject = None datasets = [] ct = len(datasetNames) for iloop in range(ct): datasetName,versionno = datasetNames[iloop] # Must specify version for replications if masterGateway: if not newVersion and versionno < 0: raise ESGPublishError("Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list.") # If using a version map, lookup the version for this dataset if versionIsMap: try: newVersion = saveVersionMap[datasetName] except KeyError: raise ESGPublishError("Dataset not found in version map: %s"%datasetName) context = initcontext.copy() # Get offline flag if type(offlineArg) is dict: offline = offlineArg[datasetName] else: offline = offlineArg # Don't try to aggregate offline datasets if offline: forceAggregate=False # Get a file iterator and sample file if dmap is not None: if len(dmap[(datasetName,versionno)])==0: warning("No files specified for dataset %s, version %d."%(datasetName,versionno)) continue firstFile = dmap[(datasetName,versionno)][0][0] fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg) else: direcTuples = directoryMap[datasetName] firstDirec, sampleFile = direcTuples[0] firstFile = os.path.join(firstDirec, sampleFile) if not readFiles: fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt) else: fileiter = fnIterator([sampfile for direc, sampfile in direcTuples]) # If the project is not specified, try to read it from the first file if handlerDictionary is not None and handlerDictionary.has_key(datasetName): handler = handlerDictionary[datasetName] elif projectName is not None: handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline, **handlerExtraArgs) else: handler = getHandler(firstFile, Session, validate=True, **handlerExtraArgs) if handler is None: raise ESGPublishError("No project found in file %s, specify with --project."%firstFile) projectName = handler.name info("Using project name = %s"%projectName) if prevProject is not None and projectName!=prevProject: raise ESGPublishError("Multiple projects found: %s, %s. 
Can only publish from one project"%(prevProject, projectName)) prevProject = projectName # Generate the initial context from the dataset name context = handler.parseDatasetName(datasetName, context) # Load the rest of the context from the first file, if possible context = handler.getContext(**context) # Add properties from the command line fieldNames = handler.getFieldNames() for name, value in properties.items(): if name not in fieldNames: warning('Property not configured: %s, was ignored'%name) else: context[name] = value # add dataset_version to context to allow version to be a mandatory field if versionno > -1: context['dataset_version'] = versionno elif newVersion is not None: context['dataset_version'] = newVersion # Update the handler context and fill in default values handler.updateContext(context, True) # Ensure that fields are valid: try: handler.validateContext(context) except ESGInvalidMandatoryField: if offline: error("Dataset id has a missing or invalid mandatory field") raise # Create a CFHandler for validation of standard names, checking time axes, etc. cfHandler = handler.getMetadataHandler(sessionMaker=Session) dataset=None if testProgress1 is not None: testProgress1[1] = (100./ct)*iloop if not offline: testProgress1[2] = (100./ct)*iloop + (50./ct) else: testProgress1[2] = (100./ct)*iloop + (100./ct) dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, perVariable=perVariable, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, nodbwrite=nodbwrite, pid_connector=pid_connector, test_publication=test_publication, commitEvery=commitEvery, **context) # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset. runAggregate = (not offline) if hasattr(dataset, 'reaggregate'): runAggregate = (runAggregate and dataset.reaggregate) runAggregate = runAggregate or forceAggregate # Turn off aggregations if skip_aggregations is set # This applies even if forceAggregate is set to True elsewhere in the # code when republishing an earlier version of the dataset section = 'project:%s' % context.get('project') config = getConfig() skipAggregate = config.getboolean(section, 'skip_aggregations', False) if runAggregate and skipAggregate: runAggregate = False info("Skipping aggregations due to skip_aggregations config option") if testProgress2 is not None: testProgress2[1] = (100./ct)*iloop + 50./ct testProgress2[2] = (100./ct)*(iloop + 1) if runAggregate and (not nodbwrite): aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset) elif testProgress2 is not None: # Just finish the progress GUI issueCallback(testProgress2, 1, 1, 0.0, 1.0) # Save the context with the dataset, so that it can be searched later if (not nodbwrite): handler.saveContext(datasetName, Session) datasets.append(dataset) return datasets
def validateContext(self, context):
    """
    Validate context values:

    - Mandatory values must be non-blank, and if enumerated must have a valid value.
    - Non-mandatory enumerated values must either be blank or one of the valid values.

    Raises ESGPublishError if a validation error occurs.

    If the validate configuration option is set to False in the project section,
    validation always succeeds.
    """
    if not self.validate:
        return

    for key in context.keys():
        fieldType = self.getFieldType(key)

        # Ignore non-configured fields
        if fieldType is None:
            continue

        isenum = (fieldType == ENUM)
        if isenum:
            options = self.getFieldOptions(key)
        value = context[key]

        config = getConfig()
        project_section = 'project:%s' % self.name
        delimiter = config.get(project_section, key + "_delimiter", default="")

        if value in ['', None]:
            # if the value is not in the default context, try to get it from the
            # key_pattern or *_map options
            option = '%s_pattern' % key
            if config.has_option(project_section, option):
                value = config.get(project_section, option, False, context)
                context[key] = value
            elif config.has_option(project_section, 'maps'):
                for map_option in splitLine(config.get(project_section, 'maps', default=''), ','):
                    from_keys, to_keys, value_dict = splitMap(config.get(project_section, map_option))
                    if key in to_keys:
                        from_values = tuple(context[k] for k in from_keys)
                        to_values = value_dict[from_values]
                        value = to_values[to_keys.index(key)]
                        context[key] = value

        if self.isMandatory(key):
            if value in ['', None]:
                if isenum:
                    raise ESGInvalidMandatoryField(
                        "Mandatory field '%s' not set, must be one of %s" % (key, `options`))
                else:
                    raise ESGInvalidMandatoryField("Mandatory field '%s' not set" % key)
            elif isenum and not self.compareEnumeratedValue(value, options, delimiter):
                validOptions = self.mapValidFieldOptions(key, options)
                raise ESGInvalidMandatoryField(
                    "Invalid value of mandatory field '%s': %s, must be one of %s" % (key, value, `validOptions`))
        elif isenum:  # non-mandatory field
            options += ['', None]
            if not self.compareEnumeratedValue(value, options, delimiter):
                validOptions = self.mapValidFieldOptions(key, options)
                raise ESGPublishError(
                    "Invalid value of '%s': %s, must be one of %s" % (key, value, `validOptions`))
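# Sketch of the "<field>_pattern" fallback used above: ConfigParser-style
# interpolation fills %(name)s references in the pattern from the current
# context. This uses the stock Python 2 ConfigParser rather than the publisher's
# config wrapper, and the section, option and context values are examples.
import ConfigParser, StringIO

ini = "[project:test]\nexperiment_pattern = %(activity)s-%(subexperiment)s\n"
cp = ConfigParser.SafeConfigParser()
cp.readfp(StringIO.StringIO(ini))
context = {'activity': 'CMIP', 'subexperiment': 'none'}
print(cp.get('project:test', 'experiment_pattern', False, context))  # CMIP-none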
def validateFile(self, fileobj):
    """
    For CMIP6, this will first verify whether the data was written by CMOR at the
    correct version set in the ini file. If so, the file is declared valid. If not,
    the file goes through the PrePARE (CV) check. PrePARE runs CFChecker.

    Raises ESGPublishError if settings are missing or the file fails the checks.
    Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
    """
    validator = PrePARE.PrePARE

    f = fileobj.path

    if self.replica:
        debug("skipping PrePARE for replica (file %s)" % f)
        return

    # TODO: refactoring - these could be loaded upfront in the constructor
    config = getConfig()
    project_section = 'project:' + self.name
    project_config_section = 'config:' + self.name
    min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0")
    min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0")
    data_specs_version = config.get(project_config_section, "data_specs_version", default="master")
    cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH)

    try:
        file_cmor_version = fileobj.getAttribute('cmor_version', None)
    except:
        file_cmor_version = None
        debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

    passed_cmor = False
    if compareLibVersions(min_cmor_version, file_cmor_version):
        debug('File %s cmor-ized at version %s, passed!' % (f, file_cmor_version))
        passed_cmor = True

    try:
        table = fileobj.getAttribute('table_id', None)
    except:
        raise ESGPublishError("File %s missing required table_id global attribute" % f)

    try:
        variable_id = fileobj.getAttribute('variable_id', None)
    except:
        raise ESGPublishError("File %s missing required variable_id global attribute" % f)

    # data_specs_version drives CMOR table fetching:
    # Behavior A (default): fetch the "master" branch (if data_specs_version is not set in esg.ini)
    # Behavior B: fetch the branch specified by "data_specs_version=my_branch" in esg.ini
    # Behavior C: fetch the branch given by the file's global attributes, using "data_specs_version=file" in esg.ini
    try:
        file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
    except Exception as e:
        raise ESGPublishError("File %s missing required data_specs_version global attribute" % f)

    if not compareLibVersions(min_ds_version, file_data_specs_version):
        raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s" % (f, file_data_specs_version, min_ds_version))
    # at this point the file has the correct data specs version;
    # if it was also CMOR-ized with the correct version tag, we can exit
    if passed_cmor:
        return

    if data_specs_version == "file":
        data_specs_version = file_data_specs_version

    checkAndUpdateRepo(cmor_table_path, data_specs_version)

    try:
        process = validator.checkCMIP6(cmor_table_path)
        if process is None:
            raise ESGPublishError("File %s failed the CV check - object create failure" % f)
        process.ControlVocab(f)
    except:
        raise ESGPublishError("File %s failed the CV check" % f)
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of )dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server. las Boolean flag: if true (the default), reinitialize server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. restInterface Boolean flag. If True, publish datasets with the RESTful publication services. """ if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation) deleteOnGateway = (gatewayOperation==DELETE) operation = (gatewayOperation!=NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s"%datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: serviceURL = getRestServiceURL() serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = nameDict[datasetName] if (not DELETE_AT_DATASET_LEVEL) and (dset is not None): for versionObj in versionObjs: try: eventName, stateName = deleteGatewayDatasetVersion(versionObj.name, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n'))) continue info(" Result: %s"%stateName) resultDict[datasetName] = eventName else: # Nothing in the node database, but still try to delete on the gateway if DELETE_AT_DATASET_LEVEL and (dset is not None) and (not restInterface): datasetName = dset.name try: eventName, stateName = deleteGatewayDatasetVersion(datasetName, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n'))) continue
def initializeFields(self, Session): """Initialize field names and options based on the configuration file.""" from esgcet.model import Model, Experiment config = getConfig() projectSection = 'project:' + self.name categoryOption = config.get(projectSection, 'categories') categorySpecs = splitRecord(categoryOption) for category, categoryTypeS, isMandatoryS, isThreddsPropertyS, displayOrderS in categorySpecs: categoryType = getCategoryType(categoryTypeS) isMandatory = getBoolean(isMandatoryS) isThreddsProperty = getBoolean(isThreddsPropertyS) displayOrder = string.atoi(displayOrderS) self.fieldNames[category] = (categoryType, isMandatory, isThreddsProperty, displayOrder) categoryDefaultsOption = config.get(projectSection, 'category_defaults', default=None, raw=True) if categoryDefaultsOption is not None: categoryDefaultsSpecs = splitRecord(categoryDefaultsOption) for category, categoryDefault in categoryDefaultsSpecs: self.categoryDefaults[category] = categoryDefault session = Session() # Find any new experiments. This allows experiments to be added to the config file without # running esginitialize. if self.fieldNames.has_key('experiment') and self.fieldNames[ 'experiment'][WIDGET_TYPE] == ENUM: initializeExperiments(config, self.name, session) for category in self.getFieldNames(): # At the moment some fields are predefined if category == "project": projects = splitRecord( config.get(projectSection, 'project_options', default='')) self.validValues['project'] = [x[0] for x in projects] elif category == "model": models = session.query(Model).filter_by( project=self.name).all() self.validValues['model'] = [x.name for x in models] elif category == "experiment": experiments = session.query(Experiment).filter_by( project=self.name).all() self.validValues['experiment'] = [x.name for x in experiments] elif category == "creator": creators = splitRecord( config.get(projectSection, 'creator_options', default='')) self.validValues['creator'] = [x[0] for x in creators] self.validMaps['creator'] = genMap(creators) elif category == "publisher": publishers = splitRecord( config.get(projectSection, 'publisher_options', default='')) self.validValues['publisher'] = [x[0] for x in publishers] self.validMaps['publisher'] = genMap(publishers) else: categoryType = self.getFieldType(category) if categoryType == ENUM: option = category + "_options" self.validValues[category] = splitLine( config.get(projectSection, option), ',') self.context[category] = '' session.close()
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None,
                       offline=False, operation=CREATE_OP, progressCallback=None, stopEvent=None, keepVersion=False,
                       newVersion=None, extraFields=None, masterGateway=None, comment=None, useVersion=-1,
                       forceRescan=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.
    fileIterator
      An iterator that returns an iteration of (file_path, file_size), where file_size is an integer.
    dbSession
      A database Session.
    handler
      Project handler.
    cfHandler
      A CF handler instance.
    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.
    offline
      Boolean, True if the files are offline and cannot be scanned.
    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP.
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.
    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``).
      If set to True (in another thread) the extraction is stopped.
    keepVersion
      Boolean, True if the dataset version should not be incremented.
    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.
    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.
    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated.
    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.
    useVersion
      Integer version number of the dataset version to modify. By default the latest version is modified.
    forceRescan
      Boolean, if True force all files to be rescanned on an update.
    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields
      needed to uniquely define the dataset.
    """
    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s'%context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None

        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None

        versionByDate = config.getboolean(section, 'version_by_date', default=False)
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False

    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation==CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s"%datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset."%dset.name)

    createTime = datetime.datetime.now() # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if operation==CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s"%datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context)
    elif operation in [UPDATE_OP, REPLACE_OP]:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, **context)
    elif operation==RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    elif operation==DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s"%`operation`)

    # Create a new dataset version if necessary
    if keepVersion:
        if existingVersion<=0:
            newVersion = getInitialDatasetVersion(versionByDate)
        else:
            newVersion = existingVersion
    elif newVersion is None:
        newVersion = getNextDatasetVersion(existingVersion, versionByDate)

    dset.reaggregate = False

    # Add a new version
    if addNewVersion and newVersion>existingVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title
        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes, tech_notes_title=datasetTechNotesTitle)
        info("New dataset version = %d"%newDsetVersionObj.version)
        for var in dset.variables:
            session.delete(var)
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    # Keep the current (latest) version
    elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d"%versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    elif masterGateway is not None: # Force version set on replication
        info("Dataset version = %d"%newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
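# Illustration only: a rough sketch of how extractFromDataset() might be driven. The
# session factory, handler objects and file paths below are placeholders, not the
# publisher's actual wiring (esgpublish normally assembles these from esg.ini).
def _example_scan(datasetName, filePaths, Session, handler, cfHandler):
    import os
    # extractFromDataset expects an iterable of (file_path, file_size) pairs
    fileIterator = [(p, os.stat(p).st_size) for p in filePaths]
    dset = extractFromDataset(datasetName, fileIterator, Session, handler, cfHandler,
                              operation=CREATE_OP,   # first publication of this dataset
                              keepVersion=False,     # let the version number be incremented
                              project='cmip6')       # extra keyword fields become the dataset context
    return dset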
def parseDatasetName(self, datasetName, context): """Parse a dataset name. Returns a dictionary, mapping field => value. The config file option 'dataset_id' is used to parse the name into fields. datasetName String dataset identifier. context Initial context dictionary. This argument is altered on output. """ config = getConfig() section = 'project:' + self.name datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None) if datasetIdFormatList is None: # warning("No dataset_id option found for project %s"%self.name) return context datasetIdFormats = splitLine(datasetIdFormatList) formatMatched = False for idFormat in datasetIdFormats: # '.' => '\.' newinit = re.sub(r'\.', r'\.', idFormat.strip()) # %(name)s => (?P<name>[^.]*) newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit) # If experiment is enumerated, match on the experiment options. This allows # experiment ids to contain periods (.) . experimentOptions = self.getFieldOptions('experiment') # Map to case-sensitive options experimentOptions = self.mapValidFieldOptions( 'experiment', experimentOptions) if idFormat.find( '%(experiment)s') != -1 and experimentOptions is not None: if len(experimentOptions) > 0: optionOr = reduce(lambda x, y: x + '|' + y, experimentOptions) experimentPattern = r'(?P<experiment>%s)' % optionOr newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern) if newinit[-1] != '$': newinit += '$' match = re.match(newinit, datasetName) if match is None: continue else: result = match.groupdict() formatMatched = True for key, value in result.items(): if context.has_key(key) and value != context[key]: warning("Dataset ID=%s, but %s=%s" % (datasetName, key, context[key])) else: context[str(key)] = value break if not formatMatched: warning( "Dataset ID: %s does not match the dataset_id format(s): %s" % (datasetName, ` datasetIdFormats `)) return context
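# Illustration only: how a dataset_id format string is turned into a regular expression
# by parseDatasetName(). The token pattern below stands in for the module-level _patpat,
# and the example format/name are made up.
import re

def _example_parse(dataset_id_format, dataset_name):
    token = re.compile(r'%\(([^()]*)\)s')              # matches %(field)s tokens
    regex = re.sub(r'\.', r'\.', dataset_id_format)    # '.' -> '\.' (literal dots)
    regex = token.sub(r'(?P<\1>[^.]*)', regex) + '$'   # each field matches up to the next '.'
    m = re.match(regex, dataset_name)
    return m.groupdict() if m else None

# _example_parse('%(project)s.%(model)s.%(experiment)s.%(ensemble)s',
#                'cmip5.MPI-ESM-LR.historical.r1i1p1')
# -> {'project': 'cmip5', 'model': 'MPI-ESM-LR', 'experiment': 'historical', 'ensemble': 'r1i1p1'}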
def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False): """Generate a directory map. Recursively scan each directory in *directoryList*, locating each directory with at least one file matching filefilt. Returns a directory map (dictionary) mapping dataset_id => [(directory_path, filepath), (directory_path, filepath), ...] where the dataset_id is generated by matching the 'directory_format' configuration option to each directory path. The map has one entry per directory, where it is assumed that all files in the directory belong to the same dataset. directoryList List of directories to scan. The scan searches for directories matching the 'directory_format' configuration file option for this project, and having at least one file matching *filefilt*. filefilt Regular expression as defined by the Python **re** module. Matched against the file basename. initContext Dictionary of field => value items. Entries override values found from matching the directory paths. datasetName Name of the dataset. If not specified, generate with ``generateDatasetId()``. """ from esgcet.publish import nodeIterator # If the dataset name is specified, no need to get directory format filters if datasetName is None: # Get the dataset_id and filters filters = self.getFilters() config = getConfig() section = 'project:' + self.name dataset_id_formats = splitLine( config.get(section, 'dataset_id', raw=True)) idfields = [ re.findall(_patpat, format) for format in dataset_id_formats ] else: filters = [r'.*$'] # Iterate over nodes mapdict = self.getMaps() datasetMap = {} for direc in directoryList: if direc[-1] == '/': direc = direc[:-1] nodeiter = nodeIterator(direc, filters, filefilt) for nodepath, filepath, groupdict in nodeiter: if initContext is not None: groupdict.update(initContext) if not groupdict.has_key('project'): groupdict['project'] = self.name if datasetName is None: try: datasetId = self.generateDatasetId( 'dataset_id', idfields, groupdict, multiformat=dataset_id_formats) if use_version and 'version' in groupdict: drsversion = groupdict['version'] if not re.match('^[0-9]+$', drsversion[0]): # e.g. vYYYYMMDD drsversion = drsversion[1:] datasetId += '#%s' % drsversion except: allfields = reduce(lambda x, y: set(x) + set(y), idfields) missingFields = list((set(allfields) - set(groupdict.keys())) - set(config.options(section))) raise ESGPublishError( "Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s" % ( ` missingFields `, nodepath)) else: warning( "Empty dataset name. Check that directory hierarchy format matches the configured format string in esg.ini" ) datasetId = datasetName if datasetMap.has_key(datasetId): datasetMap[datasetId].append((nodepath, filepath)) else: datasetMap[datasetId] = [(nodepath, filepath)] if (len(datasetMap) == 0): warning( "Empty datasetMap. Check that directory hierarchy format matches the configured format string in esg.ini" ) return datasetMap
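# Illustration only: the shape of the dictionary generateDirectoryMap() returns. The
# dataset id, version and paths are made up; there is one entry per scanned directory,
# and every file found in that directory is assigned to that dataset.
_example_directory_map = {
    'cmip5.output1.MPI-M.MPI-ESM-LR.historical.mon.atmos.Amon.r1i1p1#20120101': [
        ('/data/cmip5/output1/MPI-M/MPI-ESM-LR/historical/mon/atmos/Amon/r1i1p1/v20120101/tas',
         '/data/cmip5/output1/MPI-M/MPI-ESM-LR/historical/mon/atmos/Amon/r1i1p1/v20120101/tas/tas_Amon_MPI-ESM-LR_historical_r1i1p1_185001-200512.nc'),
    ],
}
# With use_version=True, a 'version' field captured from the directory_format
# (e.g. 'v20120101') is appended to the dataset id after '#', minus any leading 'v'.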
def __init__(self, url, certFile, certs_location, keyFile=None, debug=False): """ Create a RESTful ESGF Publication Service proxy. The proxy supports both the current and legacy publication services. The current API is defined at: http://esgf.org/wiki/ESGF_Publishing_Services See http://esgf.org/esg-search-site/LegacyPublishingService.html for definition of the legacy API. url Publication service URL. For example, 'https://pcmdi9.llnl.gov/esg-search/ws/'. Note that the actual service operation will be appended to the URL. certFile Client certificate file in PEM format, for client cert authentication. keyfile Client key file, if different from certFile. debug: Boolean flag. If True, write debugging information. """ self.service_type = 'REST' self.url = url if self.url[-1] != '/': self.url += '/' self.harvestUrl = urljoin(self.url, 'harvest') self.deleteUrl = urljoin(self.url, 'delete') self.retractUrl = urljoin(self.url, 'retract') self.certFile = certFile if keyFile is not None: self.keyFile = keyFile else: self.keyFile = certFile outdir=os.path.dirname(certFile) concat_certs=outdir+'/concatenatedcerts' # need to concatenate the certs bundle to the cert to use as the CA context. Thanks pchengi for the initial fix! # check if there is a setting, if none, use a default config = getConfig() certs_bundle_location = DEFAULT_CERTS_BUNDLE try: certs_bundle_location = config.get('DEFAULT', 'esg_certificates_bundle') except: certs_bundle_location = DEFAULT_CERTS_BUNDLE files=[certFile,certs_bundle_location] with open(concat_certs,'w') as outfile: for certf in files: with open(certf, 'r') as file: outfile.write(file.read()) outfile.write('\n') self.certs_location = concat_certs self.debug = debug self.status = 0 self.message = ''
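# Illustration only: a rough sketch of constructing the REST publication proxy defined
# above. The URL and paths are placeholders (esgpublish normally reads them from esg.ini).
# Note that the proxy rebuilds its CA context itself, concatenating the client certificate
# with the configured certificate bundle into '<cert dir>/concatenatedcerts'.
def _example_make_rest_service():
    return RestPublicationService('https://esgf-node.example.org/esg-search/ws/',
                                  '/home/publisher/.globus/certificate-file',
                                  '/etc/grid-security/certificates',
                                  keyFile='/home/publisher/.globus/certificate-file',
                                  debug=True)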
def parseDatasetName(self, datasetName, context): """Parse a dataset name. Returns a dictionary, mapping field => value. The config file option 'dataset_id' is used to parse the name into fields. datasetName String dataset identifier. context Initial context dictionary. This argument is altered on output. """ config = getConfig() section = 'project:'+self.name datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None) if datasetIdFormatList is None: # warning("No dataset_id option found for project %s"%self.name) return context datasetIdFormats = splitLine(datasetIdFormatList) formatMatched = False for idFormat in datasetIdFormats: # '.' => '\.' newinit = re.sub(r'\.', r'\.', idFormat.strip()) # %(name)s => (?P<name>[^.]*) newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit) # If experiment is enumerated, match on the experiment options. This allows # experiment ids to contain periods (.) . experimentOptions = self.getFieldOptions('experiment') # Map to case-sensitive options experimentOptions = self.mapValidFieldOptions('experiment', experimentOptions) if idFormat.find('%(experiment)s')!=-1 and experimentOptions is not None: if len(experimentOptions) > 0: optionOr = reduce(lambda x,y: x+'|'+y, experimentOptions) experimentPattern = r'(?P<experiment>%s)'%optionOr newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern) if newinit[-1]!='$': newinit += '$' match = re.match(newinit, datasetName) if match is None: continue else: result = match.groupdict() formatMatched = True for key, value in result.items(): if context.has_key(key) and value!=context[key]: warning("Dataset ID=%s, but %s=%s"%(datasetName, key, context[key])) else: context[str(key)] = value break if not formatMatched: warning("Dataset ID: %s does not match the dataset_id format(s): %s"%(datasetName, `datasetIdFormats`)) return context
def validateContext(self, context): """ Validate context values: - Mandatory values must be non-blank, and if enumerated have a valid value - If enumerated, non-mandatory values must be blank or have a valid value otherwise if enumerated the field must be either be blank or one of the valid values Raises ESGPublishError if a validation error occurs If the validate configuration option is set to False in the project section, validation always succeeds. """ if not self.validate: return for key in context.keys(): fieldType = self.getFieldType(key) # Ignore non-configured fields if fieldType is None: continue isenum = (fieldType==ENUM) if isenum: options = self.getFieldOptions(key) value = context[key] config = getConfig() project_section = 'project:%s' % self.name delimiter = config.get(project_section, key + "_delimiter", default="") if value in ['', None]: # if value not in default context, try to get it from key_pattern or *_map option = '%s_pattern' % key if config.has_option(project_section, option): value = config.get(project_section, option, False, context) context[key] = value elif config.has_option(project_section, 'maps'): for map_option in splitLine(config.get(project_section, 'maps', default=''), ','): from_keys, to_keys, value_dict = splitMap(config.get(project_section, map_option)) if key in to_keys: from_values = tuple(context[k] for k in from_keys) to_values = value_dict[from_values] value = to_values[to_keys.index(key)] context[key] = value if self.isMandatory(key): if value in ['', None]: if isenum: raise ESGInvalidMandatoryField("Mandatory field '%s' not set, must be one of %s"%(key, `options`)) else: raise ESGInvalidMandatoryField("Mandatory field '%s' not set"%key) elif isenum and not self.compareEnumeratedValue(value, options, delimiter): validOptions = self.mapValidFieldOptions(key, options) raise ESGInvalidMandatoryField("Invalid value of mandatory field '%s': %s, must be one of %s"%(key, value, `validOptions`)) elif isenum: # non-mandatory field options += ['', None] if not self.compareEnumeratedValue(value, options, delimiter): validOptions = self.mapValidFieldOptions(key, options) raise ESGPublishError("Invalid value of '%s': %s, must be one of %s"%(key, value, `validOptions`))
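# Illustration only: the kind of lookup validateContext() performs when a blank field can
# be derived from a *_map option. The (from_keys, to_keys, value_dict) triple below mimics
# what splitMap() returns; the field names and values are made up for the sketch.
_example_from_keys = ('model',)
_example_to_keys = ('institute',)
_example_value_dict = {
    ('MPI-ESM-LR',): ('MPI-M',),
    ('CanESM2',):    ('CCCma',),
}

def _example_fill(context, key='institute'):
    "Fill a blank context[key] the way the map lookup in validateContext() works."
    from_values = tuple(context[k] for k in _example_from_keys)
    to_values = _example_value_dict[from_values]
    context[key] = to_values[_example_to_keys.index(key)]
    return context

# _example_fill({'model': 'CanESM2', 'institute': ''})
# -> {'model': 'CanESM2', 'institute': 'CCCma'}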
threddsOutput.close() if reinitThredds: updateThreddsMasterCatalog(Session) result = reinitializeThredds() if las: try: result = reinitializeLAS() except Exception, e: messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e) if publish: # Create the web service proxy config = getConfig() threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL(project_config_section=project_config_section) servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') servicePollingDelay = config.getfloat('DEFAULT','hessian_service_polling_delay') spi = servicePollingIterations = config.getint('DEFAULT','hessian_service_polling_iterations') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: # REST service spi = 1 serviceURL = getRestServiceURL(project_config_section=project_config_section) serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)
def validateFile(self, fileobj): """ for CMIP6, this will first verify if the data is written by CMOR at the correct version set in the ini file. If so, the file is declared valid. If not, file will go through PrePARE (CV) check. PrePARE runs CFChecker Raises ESGPublishError if settings are missing or file fails the checks. Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler. """ validator = PrePARE.PrePARE f = fileobj.path # todo refactoring these could loaded upfront in the constructor config = getConfig() project_section = 'project:' + self.name project_config_section = 'config:' + self.name min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0") min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0") data_specs_version = config.get(project_config_section, "data_specs_version", default="master") cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH) force_validation = config.getboolean(project_config_section, "force_validation", default=False) cmor_table_subdirs = config.getboolean(project_config_section, "cmor_table_subdirs", default=False) if not force_validation: if self.replica: info("skipping PrePARE for replica (file %s)" % f) return try: file_cmor_version = fileobj.getAttribute('cmor_version', None) except: file_cmor_version = None debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f) passed_cmor = False if compareLibVersions(min_cmor_version, file_cmor_version): debug('File %s cmor-ized at version %s, passed!'%(f, file_cmor_version)) passed_cmor = True try: table = fileobj.getAttribute('table_id', None) except: raise ESGPublishError("File %s missing required table_id global attribute" % f) try: variable_id = fileobj.getAttribute('variable_id', None) except: raise ESGPublishError("File %s missing required variable_id global attribute" % f) # data_specs_version drives CMOR table fetching # Behavior A (default): fetches "master" branch" (if not "data_specs_version" in esg.ini") # Behavior A: fetches branch specified by "data_specs_version=my_branch" into esg.ini # Behavior B: fetches branch specified by file global attributes using "data_specs_version=file" into esg.ini try: file_data_specs_version = fileobj.getAttribute('data_specs_version', None) except Exception as e: raise ESGPublishError("File %s missing required data_specs_version global attribute"%f) if not compareLibVersions(min_ds_version, file_data_specs_version): raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s"%(f,file_data_specs_version,min_ds_version)) # at this point the file has the correct data specs version. # if also was CMORized and has the correct version tag, we can exit if (not force_validation) and passed_cmor: return if data_specs_version == "file": data_specs_version = file_data_specs_version table_dir = getTableDir(cmor_table_path, data_specs_version, cmor_table_subdirs) debug("Validating {} using tables dir: {}".format(f, table_dir)) try: process = validator.checkCMIP6(table_dir) if process is None: raise ESGPublishError("File %s failed the CV check - object create failure"%f) process.ControlVocab(f) except: raise ESGPublishError("File %s failed the CV check"%f)
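# Illustration only: a minimal stand-in for the version test used above. The real helper is
# compareLibVersions(min_version, found_version); the tuple-comparison semantics below
# (True when found_version >= min_version) are an assumption made for this sketch.
def _example_version_at_least(min_version, found_version):
    if found_version is None:
        return False
    def parts(v):
        return tuple(int(p) for p in v.split('.'))
    return parts(found_version) >= parts(min_version)

# _example_version_at_least('3.2.8', '3.3.1') -> True
# _example_version_at_least('3.2.8', None)    -> False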
def validateFile(self, fileobj):
    """
    For CMIP6, this will first verify if the data is written by CMOR at the correct version set in the ini file.
    If so, the file is declared valid. If not, the file will go through the PrePARE (CV) check. PrePARE runs CFChecker.

    Raises ESGPublishError if settings are missing or the file fails the checks.
    Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
    """
    validator = PrePARE.PrePARE

    f = fileobj.path

    config = getConfig()
    projectSection = 'project:' + self.name

    min_cmor_version = config.get(projectSection, "min_cmor_version", default="0.0.0")

    file_cmor_version = "0.0.0"
    try:
        file_cmor_version = fileobj.getAttribute('cmor_version', None)
    except:
        debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

    if compareLibVersions(min_cmor_version, file_cmor_version):
        debug('File %s cmor-ized at version %s, passed!' % (f, file_cmor_version))
        return

    # PrePARE is going to handle the CF check now
    # min_cf_version = config.get(projectSection, "min_cf_version", default="")
    # if len(min_cf_version) == 0:
    #     raise ESGPublishError("Minimum CF version not set in esg.ini")
    # fakeversion = ["cfchecker.py", "-v", min_cf_version, "foo"]
    # (badc,coards,uploader,useFileName,standardName,areaTypes,udunitsDat,version,files)=getargs(fakeversion)
    # CF_Chk_obj = CFChecker(uploader=uploader, useFileName=useFileName, badc=badc, coards=coards, cfStandardNamesXML=standardName, cfAreaTypesXML=areaTypes, udunitsDat=udunitsDat, version=version)
    # rc = CF_Chk_obj.checker(f)
    # if (rc > 0):
    #     raise ESGPublishError("File %s fails CF check"%f)

    file_data_specs_version = None
    try:
        file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
    except Exception as e:
        raise ESGPublishError("File %s missing required data_specs_version global attribute" % f)

    table = None
    try:
        table = fileobj.getAttribute('table_id', None)
    except:
        raise ESGPublishError("File %s missing required table_id global attribute" % f)

    try:
        variable_id = fileobj.getAttribute('variable_id', None)
    except:
        raise ESGPublishError("File %s missing required variable_id global attribute" % f)

    project_section = 'config:cmip6'
    cmor_table_path = ""
    try:
        cmor_table_path = config.get(projectSection, "cmor_table_path", default="")
    except:
        debug("Missing cmor_table_path setting. Using default location")

    if cmor_table_path == "":
        cmor_table_path = DEFAULT_CMOR_TABLE_PATH

    checkAndUpdateRepo(cmor_table_path, self, file_data_specs_version)

    table_file = cmor_table_path + '/CMIP6_' + table + '.json'
    fakeargs = ['--variable', variable_id, table_file, f]
    parser = argparse.ArgumentParser(prog='esgpublisher')
    parser.add_argument('--variable')
    parser.add_argument('cmip6_table', action=validator.JSONAction)
    parser.add_argument('infile', action=validator.CDMSAction)
    parser.add_argument('outfile', nargs='?', help='Output file (default stdout)', type=argparse.FileType('w'), default=sys.stdout)
    args = parser.parse_args(fakeargs)

    # print "About to CV check:", f

    try:
        process = validator.checkCMIP6(args)
        if process is None:
            raise ESGPublishError("File %s failed the CV check - object create failure" % f)
        process.ControlVocab()
    except:
        raise ESGPublishError("File %s failed the CV check" % f)
def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False): """Generate a directory map. Recursively scan each directory in *directoryList*, locating each directory with at least one file matching filefilt. Returns a directory map (dictionary) mapping dataset_id => [(directory_path, filepath), (directory_path, filepath), ...] where the dataset_id is generated by matching the 'directory_format' configuration option to each directory path. The map has one entry per directory, where it is assumed that all files in the directory belong to the same dataset. directoryList List of directories to scan. The scan searches for directories matching the 'directory_format' configuration file option for this project, and having at least one file matching *filefilt*. filefilt Regular expression as defined by the Python **re** module. Matched against the file basename. initContext Dictionary of field => value items. Entries override values found from matching the directory paths. datasetName Name of the dataset. If not specified, generate with ``generateDatasetId()``. """ from esgcet.publish import nodeIterator # If the dataset name is specified, no need to get directory format filters if datasetName is None: # Get the dataset_id and filters filters = self.getFilters() config = getConfig() section = 'project:'+self.name dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True)) idfields = [re.findall(_patpat, format) for format in dataset_id_formats] else: filters = [r'.*$'] # Iterate over nodes mapdict = self.getMaps() datasetMap = {} for direc in directoryList: if direc[-1]=='/': direc = direc[:-1] nodeiter = nodeIterator(direc, filters, filefilt) for nodepath, filepath, groupdict in nodeiter: if initContext is not None: groupdict.update(initContext) if not groupdict.has_key('project'): groupdict['project'] = self.name if datasetName is None: try: datasetId = self.generateDatasetId('dataset_id', idfields, groupdict, multiformat=dataset_id_formats) if use_version and 'version' in groupdict: drsversion = groupdict['version'] if not re.match('^[0-9]+$', drsversion[0]): # e.g. vYYYYMMDD drsversion = drsversion[1:] datasetId += '#%s'%drsversion except: allfields = reduce(lambda x,y: set(x)+set(y), idfields) missingFields = list((set(allfields)-set(groupdict.keys()))-set(config.options(section))) raise ESGPublishError("Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"%(`missingFields`, nodepath)) else: warning("Empty dataset name. Check that directory hierarchy format matches the configured format string in esg.ini") datasetId = datasetName if datasetMap.has_key(datasetId): datasetMap[datasetId].append((nodepath, filepath)) else: datasetMap[datasetId] = [(nodepath, filepath)] if (len(datasetMap) == 0 ): warning("Empty datasetMap. Check that directory hierarchy format matches the configured format string in esg.ini") return datasetMap
threddsOutput.close() if reinitThredds: updateThreddsMasterCatalog(Session) result = reinitializeThredds() if las: try: result = reinitializeLAS() except Exception, e: messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e) if publish: # Create the web service proxy config = getConfig() threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') servicePollingDelay = config.getfloat('DEFAULT','hessian_service_polling_delay') spi = servicePollingIterations = config.getint('DEFAULT','hessian_service_polling_iterations') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: # REST service spi = 1 serviceURL = getRestServiceURL() serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)
def __init__(self, url, certFile, certs_location, keyFile=None, debug=False): """ Create a RESTful ESGF Publication Service proxy. The proxy supports both the current and legacy publication services. The current API is defined at: http://esgf.org/wiki/ESGF_Publishing_Services See http://esgf.org/esg-search-site/LegacyPublishingService.html for definition of the legacy API. url Publication service URL. For example, 'https://pcmdi9.llnl.gov/esg-search/ws/'. Note that the actual service operation will be appended to the URL. certFile Client certificate file in PEM format, for client cert authentication. keyfile Client key file, if different from certFile. debug: Boolean flag. If True, write debugging information. """ self.service_type = 'REST' self.url = url if self.url[-1] != '/': self.url += '/' self.harvestUrl = urljoin(self.url, 'harvest') self.deleteUrl = urljoin(self.url, 'delete') self.retractUrl = urljoin(self.url, 'retract') self.certFile = certFile if keyFile is not None: self.keyFile = keyFile else: self.keyFile = certFile outdir = os.path.dirname(certFile) concat_certs = outdir + '/concatenatedcerts' # need to concatenate the certs bundle to the cert to use as the CA context. Thanks pchengi for the initial fix! # check if there is a setting, if none, use a default config = getConfig() certs_bundle_location = DEFAULT_CERTS_BUNDLE try: certs_bundle_location = config.get('DEFAULT', 'esg_certificates_bundle') except: certs_bundle_location = DEFAULT_CERTS_BUNDLE files = [certs_bundle_location, certFile] with open(concat_certs, 'w') as outfile: for certf in files: with open(certf, 'r') as file: outfile.write(file.read()) outfile.write('\n') self.certs_location = concat_certs self.debug = debug self.status = 0 self.message = ''
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of )dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server. las Boolean flag: if true (the default), reinitialize server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. reinitThredds Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server. restInterface Boolean flag. If True, publish datasets with the RESTful publication services. pid_connector esgfpid.Connector object to register PIDs project_config_section Name of the project config section in esg.ini (for user specific project configs) data_node String, the datanode to unpublish (only for unpublication from Solr) """ if gatewayOperation == UNINITIALIZED: raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!") if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation) deleteOnGateway = (gatewayOperation==DELETE) operation = (gatewayOperation!=NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s"%datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL(project_config_section=project_config_section) servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: service_certs_location = getServiceCertsLoc() serviceURL = getRestServiceURL(project_config_section=project_config_section) serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, service_certs_location, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName,version in datasetNames: if version > -1: datasetToUnpublish = '%s.v%s' % (datasetName, version) else: if service.service_type == 'REST': error('Cannot unpublish multiple versions using REST. Please specify a single dataset version ("dataset_id#1"). Skipping %s' % datasetName) continue datasetToUnpublish = datasetName isDataset, dset, versionObjs, isLatest = nameDict[datasetName] try: eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n'))) continue info(" Result: %s"%stateName) resultDict[datasetName] = eventName
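# Illustration only: a rough sketch of how deleteDatasetList() might be invoked to retract
# two dataset versions through the REST interface. The dataset ids, versions and the session
# factory are placeholders; esgunpublish normally assembles these from esg.ini.
def _example_retract(Session):
    names = [('cmip6.CMIP.MPI-M.MPI-ESM1-2-LR.historical.r1i1p1f1.Amon.tas.gn', 20190710),
             ('cmip6.CMIP.MPI-M.MPI-ESM1-2-LR.historical.r1i1p1f1.Amon.pr.gn', 20190710)]
    status = deleteDatasetList(names, Session,
                               gatewayOperation=UNPUBLISH,  # retract rather than fully delete
                               thredds=True,                # also remove the TDS catalogs
                               deleteInDatabase=False,      # keep the node database entries
                               restInterface=True)
    return status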