Example #1
    def readContext(self, cdfile):
        "Get a dictionary of key/value pairs from an open file."
        f = cdfile.file

        result = {}
        if hasattr(f, 'title'):
            result['title'] = f.title
        if hasattr(f, 'Conventions'):
            result['Conventions'] = f.Conventions
        if hasattr(f, 'source'):
            result['source'] = f.source
        if hasattr(f, 'history'):
            result['history'] = f.history

        config = getConfig()
        projectSection = 'project:' + self.name

        config_key = "extract_global_attrs"

        if config.has_option(projectSection, config_key):
            cdms_file = cdms_open(self.path)
            for key in splitLine(config.get(projectSection, config_key), ','):

                # check for mapped keys
                if ':' in key:
                    parts = key.split(':')
                    value = cdms_file.__getattribute__(parts[0])
                    result[parts[1]] = value

                else:
                    result[key] = cdms_file.__getattribute__(key)

        return result
    def __init__(self, cdf, path):

        if (cdf is None):
            self.noncd = True
            self.file = {}
            self.path = path
        else:
            # load config and set the handler state based on the variables_none option
            config = getConfig()
            projectSection = 'project:input4mips'
            variables_none = config.get(projectSection,
                                        "variables_none",
                                        default="false")

            if variables_none == "false":
                self.noncd = False
                CdunifFormatHandler.__init__(self, cdf, path)
            elif variables_none == "attr":
                CdunifFormatHandler.__init__(self, cdf, path)
                self.attr_only = True
                self.noncd = True
            else:  # assume "true"
                self.noncd = True
                self.file = {}
                self.path = path
                self.attr_only = False
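
The three branches above map directly onto the values of the variables_none option. A small, hedged summary of that mapping (runs standalone; the option values are taken from the code above):

    # Hedged summary of how variables_none is assumed to drive the handler state;
    # anything other than "false" or "attr" falls through to the "true" branch.
    behaviour = {
        'false': {'noncd': False, 'attr_only': False},  # scan variables normally
        'attr':  {'noncd': True,  'attr_only': True},   # record global attributes only
        'true':  {'noncd': True,  'attr_only': False},  # skip the file contents entirely
    }
    print(behaviour.get('attr', behaviour['true']))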
Example #3
    def __init__(self, cdf, path):
        
        self.attr_only = False
        if (cdf is None ):
            self.noncd = True
            self.file = {}
            self.path = path
        else:
            # load config and set the handler state based on the variables_none option
            config = getConfig()
            projectSection = 'project:dream'
            variables_none = config.get(projectSection, "variables_none", default="false")

            if variables_none == "false":
                self.noncd = False
                CdunifFormatHandler.__init__(self, cdf, path)
            elif variables_none == "attr":
                CdunifFormatHandler.__init__(self, cdf, path)
                self.attr_only = True
                self.noncd = True
            else:  # assume "true"
                self.noncd = True
                self.file = {}
                self.path = path
                self.attr_only = False
Example #4
    def readContext(self, cdfile):
        "Get a dictionary of key/value pairs from an open file."
        f = cdfile.file

        result = {}
        if hasattr(f, 'title'):
            result['title'] = f.title
        if hasattr(f, 'Conventions'):
            result['Conventions'] = f.Conventions
        if hasattr(f, 'source'):
            result['source'] = f.source
        if hasattr(f, 'history'):
            result['history'] = f.history

        config = getConfig()
        projectSection = 'project:' + self.name

        config_key = "extract_global_attrs"

        if config.has_option(projectSection, config_key):
            cdms_file = cdms_open(self.path)
            for key in splitLine(config.get(projectSection, config_key), ','):
                
                # check for mapped keys
                if ':' in key:
                    parts = key.split(':')
                    value = cdms_file.__getattribute__(parts[0])
                    result[parts[1]] = value

                else:
                    result[key] = cdms_file.__getattribute__(key)

        return result
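
The extract_global_attrs handling above accepts plain attribute names as well as ``source:dest`` pairs that rename an attribute on the way into the result dictionary. A minimal, self-contained sketch of that behaviour (the attribute names and option value below are illustrative, not taken from a real esg.ini, and ``splitLine`` is approximated by a strip-and-split on commas):

    # Hedged sketch of the mapped-key logic in readContext().
    global_attrs = {'institution_id': 'PCMDI', 'frequency': 'mon'}   # pretend file attributes
    extract_global_attrs = 'institution_id:institute, frequency'     # illustrative option value
    result = {}
    for key in [k.strip() for k in extract_global_attrs.split(',')]:
        if ':' in key:
            src, dest = key.split(':')
            result[dest] = global_attrs[src]   # store under the mapped name
        else:
            result[key] = global_attrs[key]
    print(result)   # {'institute': 'PCMDI', 'frequency': 'mon'}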
Example #5
 def getDatasetIdFields(self):
     """Get a list of (lists of) fields associated with the dataset ID. This may be passed to ``generateDatasetId``.
     """
     config = getConfig()
     section = 'project:'+self.name
     dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
     idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
     return idfields, dataset_id_formats
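
The ``_patpat`` regular expression is assumed to match ``%(name)s`` placeholders in the configured format strings; the fields it extracts are what ``generateDatasetId`` later substitutes. A hedged, runnable illustration (the pattern and the format string are assumptions, not copied from the source):

    import re

    _patpat = r'%\(([^()]*)\)s'                       # assumed placeholder pattern
    dataset_id_format = '%(project)s.%(model)s.%(experiment)s.%(version)s'
    idfields = re.findall(_patpat, dataset_id_format)
    print(idfields)                                   # ['project', 'model', 'experiment', 'version']

    # Filling the same format from a context dictionary mirrors what
    # generateDatasetId() ultimately produces for one format string.
    context = {'project': 'dream', 'model': 'modelA',
               'experiment': 'historical', 'version': 'v1'}
    print(dataset_id_format % context)                # dream.modelA.historical.v1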
Example #6
    def generateDatasetId(self, option, idfields, groupdict, multiformat=None):
        """
        Generate a dataset ID from a config file option.

        Returns the ID.

        option
          Name of the dataset ID option

        idfields
          List of string fields needed to generate the ID, or a list of lists
          if multiformat is not None.

        groupdict
          Dictionary to generate the ID from.

        multiformat
          Set for multi-field formats, such as dataset_id.

        """
        config = getConfig()
        section = 'project:' + self.name
        mapdict = self.getMaps()
        keys = groupdict.keys()

        foundValue = False
        if multiformat is not None:
            for fieldlist, format in zip(idfields, multiformat):
                try:
                    result = self.generateDatasetId_1(option,
                                                      fieldlist,
                                                      groupdict,
                                                      config,
                                                      section,
                                                      mapdict,
                                                      keys,
                                                      format=format)
                except:
                    continue
                else:
                    foundValue = True
                    break
        else:
            try:
                result = self.generateDatasetId_1(option, idfields, groupdict,
                                                  config, section, mapdict,
                                                  keys)
            except:
                pass
            else:
                foundValue = True

        if not foundValue:
            raise ESGPublishError(
                "Cannot generate a value for option %s, please specify the dataset id explicitly."
                % option)

        return result
Example #7
    def initializeFields(self, Session):
        """Initialize field names and options based on the configuration file."""
        from esgcet.model import Model, Experiment

        config = getConfig()
        projectSection = "project:" + self.name
        categoryOption = config.get(projectSection, "categories")
        categorySpecs = splitRecord(categoryOption)
        for category, categoryTypeS, isMandatoryS, isThreddsPropertyS, displayOrderS in categorySpecs:
            categoryType = getCategoryType(categoryTypeS)
            isMandatory = getBoolean(isMandatoryS)
            isThreddsProperty = getBoolean(isThreddsPropertyS)
            displayOrder = string.atoi(displayOrderS)
            self.fieldNames[category] = (categoryType, isMandatory, isThreddsProperty, displayOrder)

        categoryDefaultsOption = config.get(projectSection, "category_defaults", default=None, raw=True)
        if categoryDefaultsOption is not None:
            categoryDefaultsSpecs = splitRecord(categoryDefaultsOption)
            for category, categoryDefault in categoryDefaultsSpecs:
                self.categoryDefaults[category] = categoryDefault

        session = Session()

        # Find any new experiments. This allows experiments to be added to the config file without
        # running esginitialize.
        if self.fieldNames.has_key("experiment") and self.fieldNames["experiment"][WIDGET_TYPE] == ENUM:
            initializeExperiments(config, self.name, session)

        for category in self.getFieldNames():
            # At the moment some fields are predefined
            if category == "project":
                projects = splitRecord(config.get(projectSection, "project_options", default=""))
                self.validValues["project"] = [x[0] for x in projects]
            elif category == "model":
                models = session.query(Model).filter_by(project=self.name).all()
                self.validValues["model"] = [x.name for x in models]
            elif category == "experiment":
                experiments = session.query(Experiment).filter_by(project=self.name).all()
                self.validValues["experiment"] = [x.name for x in experiments]
            elif category == "creator":
                creators = splitRecord(config.get(projectSection, "creator_options", default=""))
                self.validValues["creator"] = [x[0] for x in creators]
                self.validMaps["creator"] = genMap(creators)
            elif category == "publisher":
                publishers = splitRecord(config.get(projectSection, "publisher_options", default=""))
                self.validValues["publisher"] = [x[0] for x in publishers]
                self.validMaps["publisher"] = genMap(publishers)
            else:
                categoryType = self.getFieldType(category)
                if categoryType == ENUM:
                    option = category + "_options"
                    self.validValues[category] = splitLine(config.get(projectSection, option), ",")

            self.context[category] = ""

        session.close()
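
The ``categories`` option parsed above is a multi-line record option: each record carries the category name, its type, whether it is mandatory, whether it becomes a THREDDS property, and its display order. A hedged sketch of that layout and of the pipe-separated parsing ``splitRecord`` is assumed to perform (the example records are made up):

    # Hedged illustration; splitRecord() is approximated by splitting each
    # non-empty line on "|" and stripping whitespace.
    categories_option = """
    project    | enum | true  | true  | 1
    experiment | enum | true  | true  | 2
    creator    | text | false | true  | 3
    """
    records = [[field.strip() for field in line.split('|')]
               for line in categories_option.strip().splitlines()]
    for category, ctype, mandatory, thredds_prop, order in records:
        print('%s: type=%s mandatory=%s thredds=%s order=%s'
              % (category, ctype, mandatory, thredds_prop, order))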
Example #8
 def _colors_are_disabled(self):
     if self._disable_colors == None:
         config = getConfig()
         if config:
             self._disable_colors = \
                 config.getboolean('DEFAULT', 'disable_colors',
                                   default=False)
         else:
             return False  # allow colors until config is loaded
     return self._disable_colors                
Example #10
def getHessianServiceURL(project_config_section=None):
    """Get the configured value of hessian_service_url"""

    config = getConfig()
    serviceURL = None
    if project_config_section and config.has_section(project_config_section):
        serviceURL = config.get(project_config_section, 'hessian_service_url', default=None)
    if not serviceURL:
        serviceURL = config.get('DEFAULT', 'hessian_service_url')

    return serviceURL
Example #12
 def getDatasetIdFields(self):
     """Get a list of (lists of) fields associated with the dataset ID. This may be passed to ``generateDatasetId``.
     """
     config = getConfig()
     section = 'project:' + self.name
     dataset_id_formats = splitLine(
         config.get(section, 'dataset_id', raw=True))
     idfields = [
         re.findall(_patpat, format) for format in dataset_id_formats
     ]
     return idfields, dataset_id_formats
Example #13
def getRestServiceURL():
    """Get the configured value of rest_service_url. If not set,
    derive host from hessian_service_url and use '/esg-search/ws' as the path.
    """

    config = getConfig()
    serviceURL = config.get('DEFAULT', 'rest_service_url', default=None)
    if serviceURL is None:
        hessianServiceURL = config.get('DEFAULT', 'hessian_service_url')
        host = urlparse.urlparse(hessianServiceURL).netloc
        serviceURL = urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', ''))
    return serviceURL
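
When ``rest_service_url`` is not configured, the function above keeps only the host of ``hessian_service_url`` and reassembles an https URL with the ``/esg-search/ws`` path. A hedged, runnable illustration of that derivation (the example URL is invented):

    try:
        import urllib.parse as urlparse   # Python 3
    except ImportError:
        import urlparse                   # Python 2

    hessian_service_url = ('https://esgf-node.example.org/esg-search/remote/'
                           'secure/client-cert/hessian/publishingService')
    host = urlparse.urlparse(hessian_service_url).netloc
    rest_service_url = urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', ''))
    print(rest_service_url)   # https://esgf-node.example.org/esg-search/ws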
Example #14
    def validateFile(self, fileobj):
        """Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler."""
        config = getConfig()
        projectSection = 'project:'+self.name

        if config.has_option(projectSection, 'min_cmor_version'):
            min_cmor_version = config.get(projectSection, "min_cmor_version", default="0.0.0")

            file_cmor_version = fileobj.getAttribute('cmor_version', None)

            if not compareLibVersions(min_cmor_version, file_cmor_version):
                raise ESGInvalidMetadataFormat("file " + self.path + " cmor version = " + file_cmor_version + ", running checks - minimum = " + min_cmor_version)
Example #16
def getRemoteMetadataService(serviceUrl=None):
    """Get the remote metadata service.

    Returns the service object.
    """
    config = getConfig()
    if serviceUrl is None:
        remoteMetadataServiceUrl = config.get('DEFAULT', 'hessian_service_remote_metadata_url')
    else:
        remoteMetadataServiceUrl = serviceUrl
    serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
    service = Hessian(remoteMetadataServiceUrl, 80, debug=serviceDebug)
    return service
Example #17
    def generateDatasetId(self, option, idfields, groupdict, multiformat=None):
        """
        Generate a dataset ID from a config file option.

        Returns the ID.

        option
          Name of the dataset ID option

        idfields
          List of string fields needed to generate the ID, or a list of lists
          if multiformat is not None.

        groupdict
          Dictionary to generate the ID from.

        multiformat
          Set for multi-field formats, such as dataset_id.

        """
        config = getConfig()
        section = "project:" + self.name
        mapdict = self.getMaps()
        keys = groupdict.keys()

        foundValue = False
        if multiformat is not None:
            for fieldlist, format in zip(idfields, multiformat):
                try:
                    result = self.generateDatasetId_1(
                        option, fieldlist, groupdict, config, section, mapdict, keys, format=format
                    )
                except:
                    continue
                else:
                    foundValue = True
                    break
        else:
            try:
                result = self.generateDatasetId_1(option, idfields, groupdict, config, section, mapdict, keys)
            except:
                pass
            else:
                foundValue = True

        if not foundValue:
            raise ESGPublishError(
                "Cannot generate a value for option %s, please specify the dataset id explicitly." % option
            )

        return result
Example #18
def getHessianServiceURL():
    """Get the configured value of hessian_service_url"""

    config = getConfig()
    serviceURL = config.get('DEFAULT', 'hessian_service_url')

    gatewayServiceRoot = os.environ.get('ESG_GATEWAY_SVC_ROOT', None)
    if gatewayServiceRoot is not None:
        dum, serviceHost, dum, dum, dum, dum = urlparse.urlparse(serviceURL)
        dum, envServiceHost, dum, dum, dum, dum = urlparse.urlparse('http://'+gatewayServiceRoot)
        if serviceHost!=envServiceHost:
            warning("hessian_service_url=%s but environment variable ESG_GATEWAY_SVC_ROOT=%s, please reconcile these values"%(serviceURL, gatewayServiceRoot))

    return serviceURL
Example #20
def getRemoteMetadataService(serviceUrl=None):
    """Get the remote metadata service.

    Returns the service object.
    """
    config = getConfig()
    if serviceUrl is None:
        remoteMetadataServiceUrl = config.get(
            'DEFAULT', 'hessian_service_remote_metadata_url')
    else:
        remoteMetadataServiceUrl = serviceUrl
    serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
    service = Hessian(remoteMetadataServiceUrl, 80, debug=serviceDebug)
    return service
Example #21
def pollDatasetPublicationStatus(datasetName, Session, service=None):
    """
    Get the current dataset publication status by polling the gateway.

    Returns the current dataset publication status.
    
    datasetName
      String dataset name.

    Session
      A database Session.

    service
      Web service proxy instance. If None, the service is created.

    """

    session = Session()
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is None:
        messaging.error("Dataset not found: %s" % datasetName)
        session.close()
        return PUBLISH_FAILED_EVENT

    status = dset.get_publication_status()
    if status != START_PUBLISH_DATASET_EVENT:
        session.close()
        return status

    if service is None:
        config = getConfig()
        serviceURL = getHessianServiceURL()
        servicePort = config.getint('DEFAULT', 'hessian_service_port')
        serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        service = Hessian(serviceURL,
                          servicePort,
                          key_file=serviceKeyfile,
                          cert_file=serviceCertfile,
                          debug=serviceDebug)

    try:
        statusObj = PublicationStatus(dset.status_id, service)
    except socket.error, e:
        raise ESGPublishError(
            "Socket error: %s\nIs the proxy certificate %s valid?" %
            (`e`, service._cert_file))
Example #22
def getRestServiceURL(project_config_section=None):
    """Get the configured value of rest_service_url. If not set,
    derive host from hessian_service_url and use '/esg-search/ws' as the path.
    """

    config = getConfig()
    hessianServiceURL = None
    serviceURL = None
    # get project specific hessian service url
    if serviceURL is None:
        if project_config_section and config.has_section(project_config_section):
            hessianServiceURL = config.get(project_config_section, 'hessian_service_url', default=None)
        if not hessianServiceURL:
            hessianServiceURL = config.get('DEFAULT', 'hessian_service_url')
        host = urlparse.urlparse(hessianServiceURL).netloc
        serviceURL = urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', ''))
    return serviceURL
Example #23
    def validateFile(self, fileobj):
        """Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler."""
        config = getConfig()
        projectSection = 'project:' + self.name

        if config.has_option(projectSection, 'min_cmor_version'):
            min_cmor_version = config.get(projectSection,
                                          "min_cmor_version",
                                          default="0.0.0")

            file_cmor_version = fileobj.getAttribute('cmor_version', None)

            if not compareLibVersions(min_cmor_version, file_cmor_version):
                raise ESGInvalidMetadataFormat(
                    "file " + self.path + " cmor version = " +
                    file_cmor_version + ", running checks - minimum = " +
                    min_cmor_version)
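
``compareLibVersions`` is a publisher helper whose implementation is not shown here; the check above only makes sense if it returns True when the file's CMOR version is at least the configured minimum. A hedged stand-in that compares dotted version strings numerically (the name and behaviour are assumptions):

    # Hedged stand-in for the assumed behaviour of compareLibVersions(min, actual).
    def version_at_least(minimum, actual):
        if actual is None:
            return False
        as_tuple = lambda v: tuple(int(part) for part in v.split('.'))
        return as_tuple(actual) >= as_tuple(minimum)

    print(version_at_least('2.8.0', '3.2.1'))   # True
    print(version_at_least('2.8.0', '2.7.0'))   # False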
Example #24
def getRestServiceURL(project_config_section=None):
    """Get the configured value of rest_service_url. If not set,
    derive host from hessian_service_url and use '/esg-search/ws' as the path.
    """

    config = getConfig()
    hessianServiceURL = None
    serviceURL = None
    # get project specific hessian service url
    if serviceURL is None:
        if project_config_section and config.has_section(project_config_section):
            hessianServiceURL = config.get(project_config_section, 'hessian_service_url', default=None)
        if not hessianServiceURL:
            hessianServiceURL = config.get('DEFAULT', 'hessian_service_url')
        host = urlparse.urlparse(hessianServiceURL).netloc
        serviceURL = urlparse.urlunparse(('https', host, '/esg-search/ws', '', '', ''))
    return serviceURL
Example #25
def reinitializeLAS():
    """
    Reinitialize the Live Access Server. This forces the catalogs to be reread.

    Returns the HTML string returned from the URL.

    """
    config = getConfig()
    if config is None:
        raise ESGPublishError("No configuration file found.")

    lasReinitUrl = config.get("DEFAULT", "las_reinit_url")
    info("Reinitializing LAS server")

    try:
        reinitResult = readThreddsWithAuthentication(lasReinitUrl, config)
    except Exception, e:
        raise ESGPublishError("Error reinitializing the Live Access Server: %s" % e)
Example #26
 def getDirectoryFormatFilters(self):
     """Return a list of regular expression filters associated with the ``directory_format`` option
     in the configuration file. This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``.
     """
     config = getConfig()
     section = "project:" + self.name
     directory_format = config.get(section, "directory_format", raw=True)
     formats = splitLine(directory_format)
     filters = []
     for format in formats:
         pat = format.strip()
         pat2 = pat.replace("\.", "__ESCAPE_DOT__")
         pat3 = pat2.replace(".", r"\.")
         pat4 = pat3.replace("__ESCAPE_DOT__", r"\.")
         # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
         pattern = re.sub(_patpat, r"(?P<\1>[^/]*)", pat4)
         filter = "^" + pattern + "$"
         filters.append(filter)
     return filters
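
Each ``directory_format`` entry is turned into an anchored regular expression in which every ``%(name)s`` placeholder becomes a named group matching one path component. A hedged, runnable illustration of that transformation, and of using the resulting filter to pull facet values out of a path (the format string, path, and ``_patpat`` pattern are assumptions):

    import re

    _patpat = r'%\(([^()]*)\)s'    # assumed placeholder pattern
    directory_format = '/data/%(project)s/%(model)s/%(experiment)s/%(version)s'

    # Replace each placeholder with a named group that stops at the next "/",
    # then anchor the pattern, as getDirectoryFormatFilters() does.
    pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', directory_format.strip())
    filter_re = '^' + pattern + '$'

    match = re.match(filter_re, '/data/dream/modelA/historical/v20200101')
    print(match.groupdict())
    # e.g. {'project': 'dream', 'model': 'modelA', 'experiment': 'historical', 'version': 'v20200101'}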
Example #27
def reinitializeLAS():
    """
    Reinitialize the Live Access Server. This forces the catalogs to be reread.

    Returns the HTML string returned from the URL.

    """
    config = getConfig()
    if config is None:
        raise ESGPublishError("No configuration file found.")

    lasReinitUrl = config.get('DEFAULT', 'las_reinit_url')
    info("Reinitializing LAS server")

    try:
        reinitResult = readThreddsWithAuthentication(lasReinitUrl, config)
    except Exception, e:
        raise ESGPublishError(
            "Error reinitializing the Live Access Server: %s" % e)
Example #28
    def __init__(self, cdf, path):
        
        if (cdf is None ):
            self.noncd = True
            self.file = {}
            self.path = path
        else:
            # load config and set the handler state based on the variables_none option
            config = getConfig()
            projectSection = 'project:'
            variables_none = config.get(projectSection, "variables_none", default="false")

            if variables_none == "false":
                self.noncd = False
                CdunifFormatHandler.__init__(self, cdf, path)
            else:
                self.noncd = True
                self.file = {}
                self.path = path
Example #29
 def getFilters(self, option='directory_format'):
     """Return a list of regular expression filters associated with the option in the configuration file.
      This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``.
     """
     config = getConfig()
     section = 'project:' + self.name
     directory_format = config.get(section, option, raw=True)
     formats = splitLine(directory_format)
     filters = []
     for format in formats:
         pat = format.strip()
         pat2 = pat.replace('\.', '__ESCAPE_DOT__')
         pat3 = pat2.replace('.', r'\.')
         pat4 = pat3.replace('__ESCAPE_DOT__', r'\.')
         # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
         pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat4)
         filter = '^' + pattern + '$'
         filters.append(filter)
     return filters
Example #30
 def getFilters(self, option='directory_format'):
     """Return a list of regular expression filters associated with the option in the configuration file.
      This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``.
     """
     config = getConfig()
     section = 'project:'+self.name
     directory_format = config.get(section, option, raw=True)
     formats = splitLine(directory_format)
     filters = []
     for format in formats:
         pat = format.strip()
         pat2 = pat.replace('\.','__ESCAPE_DOT__')
         pat3 = pat2.replace('.', r'\.')
         pat4 = pat3.replace('__ESCAPE_DOT__', r'\.')
         # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
         pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat4)
         filter = '^'+pattern+'$'
         filters.append(filter)
     return filters
Example #31
def pollDatasetPublicationStatus(datasetName, Session, service=None):
    """
    Get the current dataset publication status by polling the gateway.

    Returns the current dataset publication status.
    
    datasetName
      String dataset name.

    Session
      A database Session.

    service
      Web service proxy instance. If None, the service is created.

    """

    session = Session()
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is None:
        messaging.error("Dataset not found: %s"%datasetName)
        session.close()
        return PUBLISH_FAILED_EVENT
    
    status = dset.get_publication_status()
    if status!=START_PUBLISH_DATASET_EVENT:
        session.close()
        return status

    if service is None:
        config = getConfig()
        serviceURL = getHessianServiceURL()
        servicePort = config.getint('DEFAULT','hessian_service_port')
        serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
    
    try:
        statusObj = PublicationStatus(dset.status_id, service)
    except socket.error, e:
        raise ESGPublishError("Socket error: %s\nIs the proxy certificate %s valid?"%(`e`, service._cert_file))
Example #32
def getServiceCertsLoc():
    service_certs_location = None
    try:
        service_certs_location = getConfig().get(
            'DEFAULT', 'hessian_service_certs_location')

    except:
        home = os.environ.get("HOME")
        if home is not None:
            service_certs_location = home + DEFAULT_CERTS_LOCATION_SUFFIX

    if service_certs_location is None:
        raise ESGPublishError(
            "hessian_service_certs_location needs to be set in esg.ini")

    if not os.path.exists(service_certs_location):
        raise ESGPublishError(
            "Error: " + service_certs_location +
            " does not exist.  Please run myproxy-logon with -b to bootstrap the certificates, or set an alternate location using the hessian_service_certs_location setting in esg.ini"
        )
    return service_certs_location
Example #33
 def getMaps(self):
     """Get a dictionary of maps from the project section.
     """
     config = getConfig()
     section = 'project:'+self.name
     if self.mapdict is None:
         mapdict = {}
         projectMaps = splitLine(config.get(section, 'maps', default=""), ',')
         for option in projectMaps:
             if option=="":
                 continue
             fromcat, tocat, projectMap = splitMap(config.get(section, option))
             for to_index, field in enumerate(tocat):
                 value = (fromcat, projectMap, to_index)
                 if mapdict.has_key(field):
                     mapdict[field].append(value)
                 else:
                     mapdict[field] = [value]
         self.mapdict = mapdict
     return self.mapdict
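
``splitMap`` is assumed to return a tuple of source field names, destination field names, and a dictionary keyed by source-value tuples; ``getMaps`` then indexes these by destination field. A hedged sketch of the structure being built (the map contents are invented):

    # Hedged sketch of the mapdict layout assumed above: for each destination
    # field, a list of (source_fields, value_table, destination_index) entries.
    fromcat = ('model_id',)
    tocat = ('model', 'institute')
    projectMap = {('HadGEM2-ES',): ('HadGEM2-ES', 'MOHC')}   # invented mapping

    mapdict = {}
    for to_index, field in enumerate(tocat):
        mapdict.setdefault(field, []).append((fromcat, projectMap, to_index))

    # Resolving the 'institute' for a known model_id:
    source_fields, table, index = mapdict['institute'][0]
    print(table[('HadGEM2-ES',)][index])   # MOHC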
Example #34
    def generateNameFromContext(self, parameter, **extraParams):
        """
        Generate a name from a config file parameter, relative to the current
        handler context. Mapped format strings are also resolved.

        Returns a string name.

        parameter
          The configuration file option, e.g., 'dataset_id'

        extraParams
          Extra options, added to the current context before resolving the name.
          On return self.context is not modified.
        """
        tempcontext = {}
        tempcontext.update(self.context)
        tempcontext.update(extraParams)
        section = 'project:'+self.name
        config = getConfig()
        generatedName = self.generateNameFromContext_1(parameter, config, section, 1, **tempcontext)
        return generatedName
Example #35
    def initializeFields(self, Session):
        BasicHandler.initializeFields(self, Session)
        config = getConfig()
        projectSection = 'project:'+self.name

        # Enumerated value validation is case-insensitive
        lowerCaseValidValues = {}
        for field, valueList in self.validValues.items():
            lowerCaseValidList = []
            validDict = {}
            for value in valueList:
                if value is not None:
                    lvalue = value.lower()
                else:
                    lvalue = None
                lowerCaseValidList.append(lvalue)
                validDict[lvalue] = value
            lowerCaseValidValues[field] = lowerCaseValidList
            self.caseSensitiveValidValues[field] = validDict
        self.validValues = lowerCaseValidValues
        self.checkFilenames = config.getboolean(projectSection, 'thredds_check_file_names', default=True)
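
The loop above lower-cases every enumerated value for validation while keeping a lookup back to the original spelling. A small, hedged illustration of the two structures it builds (the field and values are invented):

    # Hedged illustration of the case-insensitive validation structures.
    validValues = {'experiment': ['Historical', 'RCP85', None]}

    lowerCaseValidValues = {}
    caseSensitiveValidValues = {}
    for field, values in validValues.items():
        lowered = [v.lower() if v is not None else None for v in values]
        lowerCaseValidValues[field] = lowered
        caseSensitiveValidValues[field] = dict(zip(lowered, values))

    print('rcp85' in lowerCaseValidValues['experiment'])    # True
    print(caseSensitiveValidValues['experiment']['rcp85'])  # RCP85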
Example #36
 def getMaps(self):
     """Get a dictionary of maps from the project section.
     """
     config = getConfig()
     section = 'project:' + self.name
     if self.mapdict is None:
         mapdict = {}
         projectMaps = splitLine(config.get(section, 'maps', default=""),
                                 ',')
         for option in projectMaps:
             if option == "":
                 continue
             fromcat, tocat, projectMap = splitMap(
                 config.get(section, option))
             for to_index, field in enumerate(tocat):
                 value = (fromcat, projectMap, to_index)
                 if mapdict.has_key(field):
                     mapdict[field].append(value)
                 else:
                     mapdict[field] = [value]
         self.mapdict = mapdict
     return self.mapdict
Example #37
    def generateNameFromContext(self, parameter, **extraParams):
        """
        Generate a name from a config file parameter, relative to the current
        handler context. Mapped format strings are also resolved.

        Returns a string name.

        parameter
          The configuration file option, e.g., 'dataset_id'

        extraParams
          Extra options, added to the current context before resolving the name.
          On return self.context is not modified.
        """
        tempcontext = {}
        tempcontext.update(self.context)
        tempcontext.update(extraParams)
        section = 'project:' + self.name
        config = getConfig()
        generatedName = self.generateNameFromContext_1(parameter, config,
                                                       section, 1,
                                                       **tempcontext)
        return generatedName
Example #38
    def readContext(self, cdfile):
        "Get a dictionary of key/value pairs from an open file."
        f = cdfile.file

        result = {}
        if hasattr(f, 'title'):
            result['title'] = f.title
        if hasattr(f, 'Conventions'):
            result['Conventions'] = f.Conventions
        if hasattr(f, 'source'):
            result['source'] = f.source
        if hasattr(f, 'history'):
            result['history'] = f.history

        config = getConfig()
        projectSection = 'project:' + self.name

        config_key = "extract_global_attrs"

        if config.has_option(projectSection, config_key):
            for key in splitLine(config.get(projectSection, config_key), ','):
                result[key] = cdfile.getAttribute(key, None)

        return result
Example #39
def main():

    """Uses the esg.ini file options:
        - thredds_file_services
              to get a Globus endpoint UUID
        - thredds_root
              to find a directory with THREDDS xml catalogs
    """

    loadConfig(None)
    config = getConfig()
    if config is None:
        raise ESGPublishError('No configuration file found')

    # By default thredds_root is: /esg/content/thredds/esgcet
    thredds_root = config.get('DEFAULT', 'thredds_root')
    thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT', 'thredds_file_services')
    # parameters needed to re-harvest the THREDDS catalogs
    thredds_url = config.get('DEFAULT', 'thredds_url')
    hessian_service_certfile = config.get('DEFAULT', 'hessian_service_certfile')
    hessian_service_url = config.get('DEFAULT', 'hessian_service_url')
    esgf_harvesting_service_url = hessian_service_url.replace('remote/secure/client-cert/hessian/publishingService','ws/harvest')

    thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..'))
    globus_base = None
    for service in thredds_file_services:
        if service[2] == 'Globus':
            globus_base = service[1]
    if globus_base is None:
        print 'No Globus file service specified in %s\n'\
              'Add Globus file service to the thredds_file_services variable in the form:\n'\
              '        Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\
              'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI']
        sys.exit(1)

    print '\n'\
          'ESGINI: %s\n'\
          'THREDDS root: %s\n'\
          'THREDDS url: %s\n'\
          'Globus service base: %s\n'\
          'ESGF harvesting service url: %s\n'\
          'X.509 user credential: %s\n'\
          '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base, esgf_harvesting_service_url, hessian_service_certfile)

    if not globus_base.endswith('/'):
        print 'Globus service base must end with "/". Set Globus service base correctly in\n'\
              '%s and run the script again.' % os.environ['ESGINI']
        sys.exit(1)

    print 'The script recursively goes through xml files in %s\n'\
          'looking for datasets that were published without Globus file service and adds\n'\
          'Globus access to the datasets. If a dataset was published with Globus file\n'\
          'service configured, the script skips such a dataset leaving a corresponding xml\n'\
          'file unmodified. The script reinitializes THREDDS and requests the Hessian service\n'\
          'to harvest the updated xml files. Because Hessian service requires SSL\n'\
          'authentication, the X.509 certificate, %s,\n'\
          'should be valid and obtained by a user who has the publisher role in all\n'\
          'projects.\n'\
          'It is strongly advised that you make a copy of the entire %s\n'\
          'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up)

    while True:
        sys.stdout.write("Do you want to continue? [y/N]")
        line = sys.stdin.readline().rstrip()
        if line == '' or line == 'n' or line == 'N':
            sys.exit(0)
        if line == 'y' or line == 'Y':
            break

    process(thredds_root, thredds_root_up, globus_base, thredds_url, esgf_harvesting_service_url, hessian_service_certfile)
Example #40
def main():
    """Uses the esg.ini file options:
        - thredds_file_services
              to get a Globus endpoint UUID
        - thredds_root
              to find a directory with THREDDS xml catalogs
    """

    loadConfig(None)
    config = getConfig()
    if config is None:
        raise ESGPublishError('No configuration file found')

    # By default thredds_root is: /esg/content/thredds/esgcet
    thredds_root = config.get('DEFAULT', 'thredds_root')
    thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT',
                                                   'thredds_file_services')
    # parameters needed to re-harvest the THREDDS catalogs
    thredds_url = config.get('DEFAULT', 'thredds_url')
    hessian_service_certfile = config.get('DEFAULT',
                                          'hessian_service_certfile')
    hessian_service_url = config.get('DEFAULT', 'hessian_service_url')
    esgf_harvesting_service_url = hessian_service_url.replace(
        'remote/secure/client-cert/hessian/publishingService', 'ws/harvest')

    thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..'))
    globus_base = None
    for service in thredds_file_services:
        if service[2] == 'Globus':
            globus_base = service[1]
    if globus_base is None:
        print 'No Globus file service specified in %s\n'\
              'Add Globus file service to the thredds_file_services variable in the form:\n'\
              '        Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\
              'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI']
        sys.exit(1)

    print '\n'\
          'ESGINI: %s\n'\
          'THREDDS root: %s\n'\
          'THREDDS url: %s\n'\
          'Globus service base: %s\n'\
          'ESGF harvesting service url: %s\n'\
          'X.509 user credential: %s\n'\
          '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base, esgf_harvesting_service_url, hessian_service_certfile)

    if not globus_base.endswith('/'):
        print 'Globus service base must end with "/". Set Globus service base correctly in\n'\
              '%s and run the script again.' % os.environ['ESGINI']
        sys.exit(1)

    print 'The script recursively goes through xml files in %s\n'\
          'looking for datasets that were published without Globus file service and adds\n'\
          'Globus access to the datasets. If a dataset was published with Globus file\n'\
          'service configured, the script skips such a dataset leaving a corresponding xml\n'\
          'file unmodified. The script reinitializes THREDDS and requests the Hessian service\n'\
          'to harvest the updated xml files. Because Hessian service requires SSL\n'\
          'authentication, the X.509 certificate, %s,\n'\
          'should be valid and obtained by a user who has the publisher role in all\n'\
          'projects.\n'\
          'It is strongly advised that you make a copy of the entire %s\n'\
          'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up)

    while True:
        sys.stdout.write("Do you want to continue? [y/N]")
        line = sys.stdin.readline().rstrip()
        if line == '' or line == 'n' or line == 'N':
            sys.exit(0)
        if line == 'y' or line == 'Y':
            break

    process(thredds_root, thredds_root_up, globus_base, thredds_url,
            esgf_harvesting_service_url, hessian_service_certfile)
Example #41
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    data_node
        String, the datanode to unpublish (only for unpublication from Solr)

    """
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName
Example #42
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg,
                        properties, testProgress1=None, testProgress2=None, handlerDictionary=None, perVariable=None, keepVersion=False, newVersion=None,
                        extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False, nodbwrite=False,
                        pid_connector=None, test_publication=False, handlerExtraArgs={}, commitEvery=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    handlerExtraArgs={}
      A dictionary of extra keyword arguments to pass when instantiating the handler.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    pid_connector
        esgfpid.Connector object to register PIDs

    commitEvery
        Integer specifying how frequently to commit file info to database when scanning files

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct): 
        datasetName,versionno = datasetNames[iloop]

        # Must specify version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError("Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list.")

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s"%datasetName)
            
        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate=False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName,versionno)])==0:
                warning("No files specified for dataset %s, version %d."%(datasetName,versionno))
                continue
            firstFile = dmap[(datasetName,versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter  = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline, **handlerExtraArgs)
        else:
            handler = getHandler(firstFile, Session, validate=True, **handlerExtraArgs)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name
            info("Using project name = %s"%projectName)
        if prevProject is not None and projectName!=prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored'%name)
            else:
                context[name] = value

        # add dataset_version to context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset=None
        if testProgress1 is not None:
           testProgress1[1] = (100./ct)*iloop
           if not offline:
              testProgress1[2] = (100./ct)*iloop + (50./ct)
           else:
              testProgress1[2] = (100./ct)*iloop + (100./ct)


        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension,
                                     offline=offline, operation=operation, progressCallback=testProgress1, perVariable=perVariable,
                                     keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway,
                                     comment=comment, useVersion=versionno, forceRescan=forceAggregate, nodbwrite=nodbwrite,
                                     pid_connector=pid_connector, test_publication=test_publication, commitEvery=commitEvery, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.

        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        # Turn off aggregations if skip_aggregations is set
        # This applies even if forceAggregate is set to True elsewhere in the
        # code when republishing an earlier version of the dataset
        section = 'project:%s' % context.get('project')
        config = getConfig()
        skipAggregate = config.getboolean(section, 'skip_aggregations', False)

        if runAggregate and skipAggregate:
            runAggregate = False
            info("Skipping aggregations due to skip_aggregations config option")

        if testProgress2 is not None:
            testProgress2[1] = (100./ct)*iloop + 50./ct
            testProgress2[2] = (100./ct)*(iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)
            
        # Save the context with the dataset, so that it can be searched later
        if (not nodbwrite):
            handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
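A condensed sketch of the per-dataset flow above, assuming a handler object exposing the same parseDatasetName/getContext/updateContext/validateContext interface; the function name and arguments are illustrative, not the publisher's actual driver code:

def build_and_validate_context(handler, datasetName, properties, versionno):
    # Seed the context from the dataset identifier, then from the first file.
    context = handler.parseDatasetName(datasetName, {})
    context = handler.getContext(**context)

    # Command-line properties override, but only for configured fields.
    for name, value in properties.items():
        if name in handler.getFieldNames():
            context[name] = value

    # Make the version available in case dataset_version is a mandatory field.
    if versionno > -1:
        context['dataset_version'] = versionno

    handler.updateContext(context, True)   # fill in default values
    handler.validateContext(context)       # raises ESGPublishError on failure
    return context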
Ejemplo n.º 43
0
    def validateContext(self, context):
        """
        Validate context values:

        - Mandatory values must be non-blank and, if enumerated, must be one of the valid values
        - Non-mandatory enumerated values must be either blank or one of the valid values

        Raises ESGPublishError if a validation error occurs

        If the validate configuration option is set to False in the project section,
        validation always succeeds.
        """
        if not self.validate:
            return

        for key in context.keys():
            fieldType = self.getFieldType(key)

            # Ignore non-configured fields
            if fieldType is None:
                continue

            isenum = (fieldType == ENUM)
            if isenum:
                options = self.getFieldOptions(key)
            value = context[key]

            config = getConfig()

            project_section = 'project:%s' % self.name
            delimiter = config.get(project_section,
                                   key + "_delimiter",
                                   default="")

            if value in ['', None]:
                # if value not in default context, try to get it from key_pattern or *_map
                option = '%s_pattern' % key
                if config.has_option(project_section, option):
                    value = config.get(project_section, option, False, context)
                    context[key] = value
                elif config.has_option(project_section, 'maps'):
                    for map_option in splitLine(
                            config.get(project_section, 'maps', default=''),
                            ','):
                        from_keys, to_keys, value_dict = splitMap(
                            config.get(project_section, map_option))
                        if key in to_keys:
                            from_values = tuple(context[k] for k in from_keys)
                            to_values = value_dict[from_values]
                            value = to_values[to_keys.index(key)]
                            context[key] = value

            if self.isMandatory(key):
                if value in ['', None]:
                    if isenum:
                        raise ESGInvalidMandatoryField(
                            "Mandatory field '%s' not set, must be one of %s" %
                            (key, `options`))
                    else:
                        raise ESGInvalidMandatoryField(
                            "Mandatory field '%s' not set" % key)
                elif isenum and not self.compareEnumeratedValue(
                        value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGInvalidMandatoryField(
                        "Invalid value of mandatory field '%s': %s, must be one of %s"
                        % (key, value, `validOptions`))
            elif isenum:  # non-mandatory field
                options += ['', None]
                if not self.compareEnumeratedValue(value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGPublishError(
                        "Invalid value of '%s': %s, must be one of %s" %
                        (key, value, `validOptions`))
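The rules above reduce to a simple per-field predicate. A standalone sketch of that logic, with is_mandatory and options passed in directly rather than taken from the handler's configuration:

def field_is_valid(value, is_mandatory, options=None):
    # Mandatory fields must be non-blank; enumerated fields must match one of the options.
    blank = value in ('', None)
    if is_mandatory and blank:
        return False
    if options is not None and not blank:
        return value in options
    return True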
Ejemplo n.º 44
0
    def validateFile(self, fileobj):
        """
        For CMIP6, this first verifies whether the data was written by CMOR at the version required by the ini file.
        If so, the file is declared valid. Otherwise, the file goes through the PrePARE (CV) check; PrePARE also runs CFChecker.

        Raises ESGPublishError if settings are missing or the file fails the checks.
        Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        if self.replica:
            debug("skipping PrePARE for replica (file %s)" % f)
            return

        # TODO: refactoring - these could be loaded upfront in the constructor
        config = getConfig()
        project_section = 'project:' + self.name
        project_config_section = 'config:' + self.name
        min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0")
        min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0")
        data_specs_version = config.get(project_config_section, "data_specs_version", default="master")
        cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH)

        try:
            file_cmor_version = fileobj.getAttribute('cmor_version', None)
        except:
            file_cmor_version = None
            debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

        passed_cmor = False
        if compareLibVersions(min_cmor_version, file_cmor_version):
            debug('File %s cmor-ized at version %s, passed!' % (f, file_cmor_version))
            passed_cmor = True

        try:
            table = fileobj.getAttribute('table_id', None)
        except:
            raise ESGPublishError("File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)
        except:
            raise ESGPublishError("File %s missing required variable_id global attribute" % f)

        # data_specs_version drives CMOR table fetching
        # Behavior A (default): fetch the "master" branch (when "data_specs_version" is not set in esg.ini)
        # Behavior B: fetch the branch given by "data_specs_version=my_branch" in esg.ini
        # Behavior C: fetch the branch given by the file's global attribute, via "data_specs_version=file" in esg.ini

        try:
            file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
        except Exception as e:
            raise ESGPublishError("File %s missing required data_specs_version global attribute"%f)

        if not compareLibVersions(min_ds_version, file_data_specs_version):
            raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s"%(f,file_data_specs_version,min_ds_version))
        # At this point the file has the correct data_specs_version.
        # If it was also CMOR-ized with an acceptable cmor_version, we can exit.

        if passed_cmor:
            return
            
        if data_specs_version == "file":
            data_specs_version = file_data_specs_version

        checkAndUpdateRepo(cmor_table_path, data_specs_version)

        try:
            process = validator.checkCMIP6(cmor_table_path)
            if process is None:
                raise ESGPublishError("File %s failed the CV check - object create failure"%f)
            process.ControlVocab(f)
        except:
            raise ESGPublishError("File %s failed the CV check"%f)
Ejemplo n.º 45
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    If republish is False:
      Returns a status dictionary: datasetName => status
    Otherwise:
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    restInterface
      Boolean flag. If True, use the RESTful publication services for the delete/retract operation.

    """

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if (not DELETE_AT_DATASET_LEVEL) and (dset is not None):
                for versionObj in versionObjs:
                    try:
                        eventName, stateName = deleteGatewayDatasetVersion(versionObj.name, gatewayOperation, service, session, dset=dset)
                    except RemoteCallException, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                        continue
                    except ESGPublishError, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                        continue
                    info("  Result: %s"%stateName)
                    resultDict[datasetName] = eventName
            else:                       # Nothing in the node database, but still try to delete on the gateway
                if DELETE_AT_DATASET_LEVEL and (dset is not None) and (not restInterface):
                    datasetName = dset.name
                try:
                    eventName, stateName = deleteGatewayDatasetVersion(datasetName, gatewayOperation, service, session, dset=dset)
                except RemoteCallException, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                    continue
                except ESGPublishError, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                    continue
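A hedged usage sketch for deleteDatasetList as documented above; the dataset identifiers are placeholders and Session is assumed to be an already-configured session factory:

to_delete = [('cmip5.output1.MYCENTER.mymodel.historical.mon.atmos.Amon.r1i1p1', 1),
             ('cmip5.output1.MYCENTER.mymodel.rcp45.mon.atmos.Amon.r1i1p1', 2)]
status = deleteDatasetList(to_delete, Session, gatewayOperation=UNPUBLISH,
                           thredds=False, deleteInDatabase=False)
for name, event in status.items():
    info("%s -> %s" % (name, event))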
Ejemplo n.º 46
0
    def initializeFields(self, Session):
        """Initialize field names and options based on the configuration file."""
        from esgcet.model import Model, Experiment
        config = getConfig()
        projectSection = 'project:' + self.name
        categoryOption = config.get(projectSection, 'categories')
        categorySpecs = splitRecord(categoryOption)
        for category, categoryTypeS, isMandatoryS, isThreddsPropertyS, displayOrderS in categorySpecs:
            categoryType = getCategoryType(categoryTypeS)
            isMandatory = getBoolean(isMandatoryS)
            isThreddsProperty = getBoolean(isThreddsPropertyS)
            displayOrder = string.atoi(displayOrderS)
            self.fieldNames[category] = (categoryType, isMandatory,
                                         isThreddsProperty, displayOrder)

        categoryDefaultsOption = config.get(projectSection,
                                            'category_defaults',
                                            default=None,
                                            raw=True)
        if categoryDefaultsOption is not None:
            categoryDefaultsSpecs = splitRecord(categoryDefaultsOption)
            for category, categoryDefault in categoryDefaultsSpecs:
                self.categoryDefaults[category] = categoryDefault

        session = Session()

        # Find any new experiments. This allows experiments to be added to the config file without
        # running esginitialize.
        if self.fieldNames.has_key('experiment') and self.fieldNames[
                'experiment'][WIDGET_TYPE] == ENUM:
            initializeExperiments(config, self.name, session)

        for category in self.getFieldNames():
            # At the moment some fields are predefined
            if category == "project":
                projects = splitRecord(
                    config.get(projectSection, 'project_options', default=''))
                self.validValues['project'] = [x[0] for x in projects]
            elif category == "model":
                models = session.query(Model).filter_by(
                    project=self.name).all()
                self.validValues['model'] = [x.name for x in models]
            elif category == "experiment":
                experiments = session.query(Experiment).filter_by(
                    project=self.name).all()
                self.validValues['experiment'] = [x.name for x in experiments]
            elif category == "creator":
                creators = splitRecord(
                    config.get(projectSection, 'creator_options', default=''))
                self.validValues['creator'] = [x[0] for x in creators]
                self.validMaps['creator'] = genMap(creators)
            elif category == "publisher":
                publishers = splitRecord(
                    config.get(projectSection, 'publisher_options',
                               default=''))
                self.validValues['publisher'] = [x[0] for x in publishers]
                self.validMaps['publisher'] = genMap(publishers)
            else:
                categoryType = self.getFieldType(category)
                if categoryType == ENUM:
                    option = category + "_options"
                    self.validValues[category] = splitLine(
                        config.get(projectSection, option), ',')

            self.context[category] = ''

        session.close()
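The categories option parsed above is, in a typical esg.ini, one pipe-delimited record per line. A minimal stand-in for that parsing, shown only to illustrate the tuple unpacked in initializeFields (splitRecord itself lives in esgcet and may handle more cases):

SAMPLE_CATEGORIES = """
experiment | enum   | true  | true  | 1
model      | enum   | true  | true  | 2
creator    | string | false | true  | 3
"""

def split_records(option_value):
    # One record per non-blank line, fields separated by '|'.
    return [[field.strip() for field in line.split('|')]
            for line in option_value.strip().splitlines()]

for category, ftype, mandatory, thredds_prop, order in split_records(SAMPLE_CATEGORIES):
    pass  # mirrors the unpacking of categorySpecs above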
Ejemplo n.º 47
0
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None, offline=False, operation=CREATE_OP, progressCallback=None, stopEvent=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, useVersion=-1, forceRescan=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    fileIterator
      An iterator over (file_path, file_size) tuples, where file_size is an integer.

    dbSession
      A database Session.

    handler
      Project handler

    cfHandler  
      A CF handler instance

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    offline
      Boolean, True if the files are offline, cannot be scanned.

    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.

    useVersion
      Integer version number of the dataset version to modify. By default (-1) the latest version is modified.

    forceRescan
      Boolean, if True force all files to be rescanned on an update.

    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields needed to uniquely define the dataset.

    """

    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s'%context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None

        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None

        versionByDate = config.getboolean(section, 'version_by_date', default=False)
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False
        
    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation==CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s"%datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset."%dset.name)

    createTime = datetime.datetime.now() # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if operation==CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s"%datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context)
        
    elif operation in [UPDATE_OP, REPLACE_OP]:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, **context)
         
    elif operation==RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
         
    elif operation==DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s"%`operation`)

    # Create a new dataset version if necessary
    if keepVersion:
        if existingVersion<=0:
            newVersion = getInitialDatasetVersion(versionByDate)
        else:
            newVersion = existingVersion
    elif newVersion is None:
        newVersion = getNextDatasetVersion(existingVersion, versionByDate)
        
    dset.reaggregate = False
    # Add a new version
    if addNewVersion and newVersion>existingVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title
        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes, tech_notes_title=datasetTechNotesTitle)
        info("New dataset version = %d"%newDsetVersionObj.version)
        for var in dset.variables:
            session.delete(var)
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    # Keep the current (latest) version
    elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d"%versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    elif masterGateway is not None:     # Force version set on replication
        info("Dataset version = %d"%newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
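The version handling near the end of extractFromDataset reduces to a small decision. A standalone sketch, with get_initial and get_next standing in for getInitialDatasetVersion and getNextDatasetVersion:

def decide_new_version(existing, keep_version, requested, get_initial, get_next):
    # keep_version pins the current version (or the initial one for a new dataset);
    # an explicit request wins otherwise; else bump to the next version.
    if keep_version:
        return existing if existing > 0 else get_initial()
    if requested is not None:
        return requested
    return get_next(existing)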
Ejemplo n.º 48
0
    def parseDatasetName(self, datasetName, context):
        """Parse a dataset name.

        Returns a dictionary, mapping field => value. The config file option 'dataset_id'
        is used to parse the name into fields.

        datasetName
          String dataset identifier.

        context
          Initial context dictionary. This argument is altered on output.

        """
        config = getConfig()
        section = 'project:' + self.name
        datasetIdFormatList = config.get(section,
                                         'dataset_id',
                                         raw=True,
                                         default=None)
        if datasetIdFormatList is None:
            # warning("No dataset_id option found for project %s"%self.name)
            return context
        datasetIdFormats = splitLine(datasetIdFormatList)

        formatMatched = False
        for idFormat in datasetIdFormats:

            # '.' => '\.'
            newinit = re.sub(r'\.', r'\.', idFormat.strip())

            # %(name)s => (?P<name>[^.]*)
            newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit)

            # If experiment is enumerated, match on the experiment options. This allows
            # experiment ids to contain periods (.) .
            experimentOptions = self.getFieldOptions('experiment')

            # Map to case-sensitive options
            experimentOptions = self.mapValidFieldOptions(
                'experiment', experimentOptions)
            if idFormat.find(
                    '%(experiment)s') != -1 and experimentOptions is not None:
                if len(experimentOptions) > 0:
                    optionOr = reduce(lambda x, y: x + '|' + y,
                                      experimentOptions)
                    experimentPattern = r'(?P<experiment>%s)' % optionOr
                    newinit = newinit.replace('(?P<experiment>[^.]*)',
                                              experimentPattern)

            if newinit[-1] != '$':
                newinit += '$'

            match = re.match(newinit, datasetName)

            if match is None:
                continue
            else:
                result = match.groupdict()
                formatMatched = True
            for key, value in result.items():
                if context.has_key(key) and value != context[key]:
                    warning("Dataset ID=%s, but %s=%s" %
                            (datasetName, key, context[key]))
                else:
                    context[str(key)] = value
            break

        if not formatMatched:
            warning(
                "Dataset ID: %s does not match the dataset_id format(s): %s" %
                (datasetName, `datasetIdFormats`))

        return context
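The dataset_id format is turned into a regular expression by escaping dots and replacing each %(field)s token with a named group. A condensed, self-contained sketch of that transformation, with an inline pattern in place of the module-level _patpat:

import re

def format_to_regex(id_format):
    # 'cmip5.%(product)s.%(model)s' -> r'cmip5\.(?P<product>[^.]*)\.(?P<model>[^.]*)$'
    pattern = re.sub(r'\.', r'\\.', id_format.strip())
    pattern = re.sub(r'%\((\w+)\)s', r'(?P<\1>[^.]*)', pattern)
    if not pattern.endswith('$'):
        pattern += '$'
    return pattern

match = re.match(format_to_regex('cmip5.%(product)s.%(model)s'), 'cmip5.output1.mymodel')
# match.groupdict() == {'product': 'output1', 'model': 'mymodel'}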
Ejemplo n.º 49
0
    def generateDirectoryMap(self,
                             directoryList,
                             filefilt,
                             initContext=None,
                             datasetName=None,
                             use_version=False):
        """Generate a directory map. Recursively scan each directory in *directoryList*,
        locating each directory with at least one file matching filefilt.

        Returns a directory map (dictionary) mapping
        dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]
        where the dataset_id is generated by matching the 'directory_format' configuration option to
        each directory path. The map has one entry per directory, where it is assumed that
        all files in the directory belong to the same dataset.

        directoryList
          List of directories to scan. The scan searches for directories matching the 'directory_format'
          configuration file option for this project, and having at least one file matching *filefilt*.

        filefilt
          Regular expression as defined by the Python **re** module. Matched against the file basename.

        initContext
          Dictionary of field => value items. Entries override values found from matching the directory paths.

        datasetName
          Name of the dataset. If not specified, generate with ``generateDatasetId()``.
        """
        from esgcet.publish import nodeIterator

        # If the dataset name is specified, no need to get directory format filters

        if datasetName is None:
            # Get the dataset_id and filters
            filters = self.getFilters()
            config = getConfig()
            section = 'project:' + self.name
            dataset_id_formats = splitLine(
                config.get(section, 'dataset_id', raw=True))
            idfields = [
                re.findall(_patpat, format) for format in dataset_id_formats
            ]
        else:
            filters = [r'.*$']

        # Iterate over nodes
        mapdict = self.getMaps()
        datasetMap = {}
        for direc in directoryList:
            if direc[-1] == '/':
                direc = direc[:-1]
            nodeiter = nodeIterator(direc, filters, filefilt)
            for nodepath, filepath, groupdict in nodeiter:
                if initContext is not None:
                    groupdict.update(initContext)
                if not groupdict.has_key('project'):
                    groupdict['project'] = self.name
                if datasetName is None:
                    try:
                        datasetId = self.generateDatasetId(
                            'dataset_id',
                            idfields,
                            groupdict,
                            multiformat=dataset_id_formats)
                        if use_version and 'version' in groupdict:
                            drsversion = groupdict['version']
                            if not re.match('^[0-9]+$',
                                            drsversion[0]):  # e.g. vYYYYMMDD
                                drsversion = drsversion[1:]
                            datasetId += '#%s' % drsversion
                    except:
                        allfields = reduce(lambda x, y: set(x) | set(y),
                                           idfields)
                        missingFields = list((set(allfields) -
                                              set(groupdict.keys())) -
                                             set(config.options(section)))
                        raise ESGPublishError(
                            "Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"
                            % (`missingFields`, nodepath))
                else:
                    warning(
                        "Empty dataset name.  Check that directory hierarchy format matches the configured format string in esg.ini"
                    )
                    datasetId = datasetName
                if datasetMap.has_key(datasetId):
                    datasetMap[datasetId].append((nodepath, filepath))
                else:
                    datasetMap[datasetId] = [(nodepath, filepath)]

        if (len(datasetMap) == 0):
            warning(
                "Empty datasetMap.  Check that directory hierarchy format matches the configured format string in esg.ini"
            )
        return datasetMap
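A hedged usage sketch for the directory map produced above; handler is assumed to be an instantiated project handler, and the scan root and file filter are placeholders:

# Hypothetical scan: every directory under /data/cmip5 containing at least one .nc file.
dirmap = handler.generateDirectoryMap(['/data/cmip5'], r'.*\.nc$')
for dataset_id, entries in dirmap.items():
    for directory, filepath in entries:
        info("%s: %s" % (dataset_id, filepath))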
Ejemplo n.º 50
0
    def __init__(self, url, certFile, certs_location, keyFile=None, debug=False):
        """

        Create a RESTful ESGF Publication Service proxy. The proxy
        supports both the current and legacy publication services.

        The current API is defined at:
        http://esgf.org/wiki/ESGF_Publishing_Services

        See http://esgf.org/esg-search-site/LegacyPublishingService.html
        for definition of the legacy API.

        url
          Publication service URL. For example, 'https://pcmdi9.llnl.gov/esg-search/ws/'.
          Note that the actual service operation will be appended to the URL.

        certFile
          Client certificate file in PEM format, for client cert authentication.

        keyFile
          Client key file, if different from certFile.

        debug:
          Boolean flag. If True, write debugging information.
        """

        self.service_type = 'REST'
        self.url = url
        if self.url[-1] != '/':
            self.url += '/'
        self.harvestUrl = urljoin(self.url, 'harvest')
        self.deleteUrl = urljoin(self.url, 'delete')
        self.retractUrl = urljoin(self.url, 'retract')

        self.certFile = certFile
        if keyFile is not None:
            self.keyFile = keyFile
        else:
            self.keyFile = certFile
        outdir = os.path.dirname(certFile)
        concat_certs = outdir + '/concatenatedcerts'

        # need to concatenate the certs bundle to the cert to use as the CA context.  Thanks pchengi for the initial fix!
        # check if there is a setting, if none, use a default

        config = getConfig()
        certs_bundle_location = DEFAULT_CERTS_BUNDLE
        try:
            certs_bundle_location = config.get('DEFAULT', 'esg_certificates_bundle')
        except:
            certs_bundle_location = DEFAULT_CERTS_BUNDLE
        

        files = [certFile, certs_bundle_location]
        with open(concat_certs,'w') as outfile:
            for certf in files:
                with open(certf, 'r') as file:
                    outfile.write(file.read())
                    outfile.write('\n')
        self.certs_location = concat_certs
        self.debug = debug
        self.status = 0
        self.message = ''
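A hedged construction example for the proxy above; the index URL and certificate paths are placeholders:

service = RestPublicationService('https://myindex.example.org/esg-search/ws/',
                                 '/etc/grid-security/hostcert.pem',
                                 '/etc/grid-security/certificates',
                                 keyFile='/etc/grid-security/hostkey.pem')
# service.harvestUrl -> 'https://myindex.example.org/esg-search/ws/harvest'
# service.deleteUrl  -> 'https://myindex.example.org/esg-search/ws/delete'
# service.retractUrl -> 'https://myindex.example.org/esg-search/ws/retract'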
Ejemplo n.º 51
0
    def parseDatasetName(self, datasetName, context):
        """Parse a dataset name.

        Returns a dictionary, mapping field => value. The config file option 'dataset_id'
        is used to parse the name into fields.

        datasetName
          String dataset identifier.

        context
          Initial context dictionary. This argument is altered on output.

        """
        config = getConfig()
        section = 'project:'+self.name
        datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None)
        if datasetIdFormatList is None:
            # warning("No dataset_id option found for project %s"%self.name)
            return context
        datasetIdFormats = splitLine(datasetIdFormatList)

        formatMatched = False
        for idFormat in datasetIdFormats:

            # '.' => '\.'
            newinit = re.sub(r'\.', r'\.', idFormat.strip())
            
            # %(name)s => (?P<name>[^.]*)
            newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit)

            # If experiment is enumerated, match on the experiment options. This allows
            # experiment ids to contain periods (.) .
            experimentOptions = self.getFieldOptions('experiment')

            # Map to case-sensitive options
            experimentOptions = self.mapValidFieldOptions('experiment', experimentOptions)
            if idFormat.find('%(experiment)s')!=-1 and experimentOptions is not None:
                if len(experimentOptions) > 0:
                    optionOr = reduce(lambda x,y: x+'|'+y, experimentOptions)
                    experimentPattern = r'(?P<experiment>%s)'%optionOr
                    newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern)
            
            if newinit[-1]!='$':
                newinit += '$'

            match = re.match(newinit, datasetName)

            if match is None:
                continue
            else:
                result = match.groupdict()
                formatMatched = True
            for key, value in result.items():
                if context.has_key(key) and value!=context[key]:
                    warning("Dataset ID=%s, but %s=%s"%(datasetName, key, context[key]))
                else:
                    context[str(key)] = value
            break

        if not formatMatched:
            warning("Dataset ID: %s does not match the dataset_id format(s): %s"%(datasetName, `datasetIdFormats`))

        return context
Ejemplo n.º 52
0
    def validateContext(self, context):
        """
        Validate context values:

        - Mandatory values must be non-blank and, if enumerated, must be one of the valid values
        - Non-mandatory enumerated values must be either blank or one of the valid values

        Raises ESGPublishError if a validation error occurs

        If the validate configuration option is set to False in the project section,
        validation always succeeds.
        """
        if not self.validate:
            return
        
        for key in context.keys():
            fieldType = self.getFieldType(key)

            # Ignore non-configured fields
            if fieldType is None:
                continue
            
            isenum = (fieldType==ENUM)
            if isenum:
                options = self.getFieldOptions(key)
            value = context[key]

            config = getConfig()

            project_section = 'project:%s' % self.name
            delimiter = config.get(project_section, key + "_delimiter", default="")

            if value in ['', None]:
                # if value not in default context, try to get it from key_pattern or *_map
                option = '%s_pattern' % key
                if config.has_option(project_section, option):
                    value = config.get(project_section, option, False, context)
                    context[key] = value
                elif config.has_option(project_section, 'maps'):
                    for map_option in splitLine(config.get(project_section, 'maps', default=''), ','):
                        from_keys, to_keys, value_dict = splitMap(config.get(project_section, map_option))
                        if key in to_keys:
                            from_values = tuple(context[k] for k in from_keys)
                            to_values = value_dict[from_values]
                            value = to_values[to_keys.index(key)]
                            context[key] = value

            if self.isMandatory(key):
                if value in ['', None]:
                    if isenum:
                        raise ESGInvalidMandatoryField("Mandatory field '%s' not set, must be one of %s"%(key, `options`))
                    else:
                        raise ESGInvalidMandatoryField("Mandatory field '%s' not set"%key)
                elif isenum and not self.compareEnumeratedValue(value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGInvalidMandatoryField("Invalid value of mandatory field '%s': %s, must be one of %s"%(key, value, `validOptions`))
            elif isenum:     # non-mandatory field
                options += ['', None]
                if not self.compareEnumeratedValue(value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGPublishError("Invalid value of '%s': %s, must be one of %s"%(key, value, `validOptions`))
Ejemplo n.º 53
0
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:    
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)

    if publish:

        # Create the web service proxy
        config = getConfig()
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            servicePollingDelay = config.getfloat('DEFAULT','hessian_service_polling_delay')
            spi = servicePollingIterations = config.getint('DEFAULT','hessian_service_polling_iterations')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:                   # REST service
            spi = 1
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)
Ejemplo n.º 54
0
    def validateFile(self, fileobj):
        """
        For CMIP6, this first verifies whether the data was written by CMOR at the version required by the ini file.
        If so, the file is declared valid. Otherwise, the file goes through the PrePARE (CV) check; PrePARE also runs CFChecker.

        Raises ESGPublishError if settings are missing or the file fails the checks.
        Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        # TODO: refactoring - these could be loaded upfront in the constructor
        config = getConfig()
        project_section = 'project:' + self.name
        project_config_section = 'config:' + self.name
        min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0")
        min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0")
        data_specs_version = config.get(project_config_section, "data_specs_version", default="master")
        cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH)
        force_validation = config.getboolean(project_config_section, "force_validation", default=False)
        cmor_table_subdirs = config.getboolean(project_config_section, "cmor_table_subdirs", default=False)

        if not force_validation:

            if self.replica:
                info("skipping PrePARE for replica (file %s)" % f)
                return

            try:
                file_cmor_version = fileobj.getAttribute('cmor_version', None)
            except:
                file_cmor_version = None
                debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

            passed_cmor = False
            if compareLibVersions(min_cmor_version, file_cmor_version):
                debug('File %s cmor-ized at version %s, passed!'%(f, file_cmor_version))
                passed_cmor = True

        try:
            table = fileobj.getAttribute('table_id', None)
        except:
            raise ESGPublishError("File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)
        except:
            raise ESGPublishError("File %s missing required variable_id global attribute" % f)

        # data_specs_version drives CMOR table fetching
        # Behavior A (default): fetch the "master" branch (when "data_specs_version" is not set in esg.ini)
        # Behavior B: fetch the branch given by "data_specs_version=my_branch" in esg.ini
        # Behavior C: fetch the branch given by the file's global attribute, via "data_specs_version=file" in esg.ini

        try:
            file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
        except Exception as e:
            raise ESGPublishError("File %s missing required data_specs_version global attribute"%f)

        if not compareLibVersions(min_ds_version, file_data_specs_version):
            raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s"%(f,file_data_specs_version,min_ds_version))
        # At this point the file has the correct data_specs_version.
        # If it was also CMOR-ized with an acceptable cmor_version, we can exit.

        if (not force_validation) and passed_cmor:
            return
            
        if data_specs_version == "file":
            data_specs_version = file_data_specs_version

        table_dir = getTableDir(cmor_table_path, data_specs_version, cmor_table_subdirs)
        debug("Validating {} using tables dir: {}".format(f, table_dir))

        try:
            process = validator.checkCMIP6(table_dir)
            if process is None:
                raise ESGPublishError("File %s failed the CV check - object create failure"%f)
            process.ControlVocab(f)
        except:
            raise ESGPublishError("File %s failed the CV check"%f)
Ejemplo n.º 55
0
    def validateFile(self, fileobj):
        """
        for CMIP6, this will first verify if the data is written by CMOR at the correct version set in the ini file.
        If so, the file is declared valid. If not, file will go through PrePARE (CV) check.  PrePARE runs CFChecker

        Raises ESGPublishError if settings are missing or file fails the checks.
        Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        config = getConfig()
        projectSection = 'project:' + self.name
        min_cmor_version = config.get(projectSection,
                                      "min_cmor_version",
                                      default="0.0.0")

        file_cmor_version = "0.0.0"

        try:
            file_cmor_version = fileobj.getAttribute('cmor_version', None)
        except:
            debug(
                'File %s missing cmor_version attribute; will proceed with PrePARE check'
                % f)

        if compareLibVersions(min_cmor_version, file_cmor_version):
            debug('File %s cmor-ized at version %s, passed!' %
                  (f, file_cmor_version))
            return

            #  PrePARE is going to handle the CF check now
        # min_cf_version = config.get(projectSection, "min_cf_version", defaut="")

        # if len(min_cf_version) == 0:
        #     raise ESGPublishError("Minimum CF version not set in esg.ini")

        # fakeversion = ["cfchecker.py", "-v", min_cf_version
        # , "foo"]
        # (badc,coards,uploader,useFileName,standardName,areaTypes,udunitsDat,version,files)=getargs(fakeversion)
        # CF_Chk_obj = CFChecker(uploader=uploader, useFileName=useFileName, badc=badc, coards=coards, cfStandardNamesXML=standardName, cfAreaTypesXML=areaTypes, udunitsDat=udunitsDat, version=version)
        # rc = CF_Chk_obj.checker(f)

        # if (rc > 0):
        #     raise ESGPublishError("File %s fails CF check"%f)

        file_data_specs_version = None
        try:
            file_data_specs_version = fileobj.getAttribute(
                'data_specs_version', None)
        except Exception as e:
            raise ESGPublishError(
                "File %s missing required data_specs_version global attribute"
                % f)

        table = None
        try:
            table = fileobj.getAttribute('table_id', None)

        except:
            raise ESGPublishError(
                "File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)

        except:
            raise ESGPublishError(
                "File %s missing required variable_id global attribute" % f)

        project_section = 'config:cmip6'

        cmor_table_path = ""
        try:
            cmor_table_path = config.get(projectSection,
                                         "cmor_table_path",
                                         default="")
        except:
            debug("Missing cmor_table_path setting. Using default location")

        if cmor_table_path == "":
            cmor_table_path = DEFAULT_CMOR_TABLE_PATH

        checkAndUpdateRepo(cmor_table_path, self, file_data_specs_version)

        table_file = cmor_table_path + '/CMIP6_' + table + '.json'
        fakeargs = ['--variable', variable_id, table_file, f]
        parser = argparse.ArgumentParser(prog='esgpublisher')
        parser.add_argument('--variable')
        parser.add_argument('cmip6_table', action=validator.JSONAction)
        parser.add_argument('infile', action=validator.CDMSAction)
        parser.add_argument('outfile',
                            nargs='?',
                            help='Output file (default stdout)',
                            type=argparse.FileType('w'),
                            default=sys.stdout)
        args = parser.parse_args(fakeargs)

        #        print "About to CV check:", f

        try:
            process = validator.checkCMIP6(args)
            if process is None:
                raise ESGPublishError(
                    "File %s failed the CV check - object create failure" % f)

            process.ControlVocab()

        except:

            raise ESGPublishError("File %s failed the CV check" % f)
Ejemplo n.º 56
0
    def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False):
        """Generate a directory map. Recursively scan each directory in *directoryList*,
        locating each directory with at least one file matching filefilt.

        Returns a directory map (dictionary) mapping
        dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]
        where the dataset_id is generated by matching the 'directory_format' configuration option to
        each directory path. The map has one entry per directory, where it is assumed that
        all files in the directory belong to the same dataset.

        directoryList
          List of directories to scan. The scan searches for directories matching the 'directory_format'
          configuration file option for this project, and having at least one file matching *filefilt*.

        filefilt
          Regular expression as defined by the Python **re** module. Matched against the file basename.

        initContext
          Dictionary of field => value items. Entries override values found from matching the directory paths.

        datasetName
          Name of the dataset. If not specified, generate with ``generateDatasetId()``.
        """
        from esgcet.publish import nodeIterator

        # If the dataset name is specified, no need to get directory format filters
        
        if datasetName is None:
            # Get the dataset_id and filters
            filters = self.getFilters()
            config = getConfig()
            section = 'project:'+self.name
            dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
            idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
        else:
            filters = [r'.*$']

        # Iterate over nodes
        mapdict = self.getMaps()
        datasetMap = {}
        for direc in directoryList:
            if direc[-1]=='/':
                direc = direc[:-1]
            nodeiter = nodeIterator(direc, filters, filefilt)
            for nodepath, filepath, groupdict in nodeiter:
                if initContext is not None:
                    groupdict.update(initContext)
                if not groupdict.has_key('project'):
                    groupdict['project'] = self.name
                if datasetName is None:
                    try:
                        datasetId = self.generateDatasetId('dataset_id', idfields, groupdict, multiformat=dataset_id_formats)
                        if use_version and 'version' in groupdict:
                            drsversion = groupdict['version']
                            if not re.match('^[0-9]+$', drsversion[0]): # e.g. vYYYYMMDD
                                drsversion = drsversion[1:]
                            datasetId += '#%s'%drsversion
                    except:
                        allfields = reduce(lambda x,y: set(x)|set(y), idfields)
                        missingFields = list((set(allfields)-set(groupdict.keys()))-set(config.options(section)))
                        raise ESGPublishError("Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"%(`missingFields`, nodepath))
                else:
                    warning("Empty dataset name.  Check that directory hierarchy format matches the configured format string in esg.ini")
                    datasetId = datasetName
                if datasetMap.has_key(datasetId):
                    datasetMap[datasetId].append((nodepath, filepath))
                else:
                    datasetMap[datasetId] = [(nodepath, filepath)]

        if (len(datasetMap) == 0 ):
            warning("Empty datasetMap.  Check that directory hierarchy format matches the configured format string in esg.ini")
        return datasetMap
Ejemplo n.º 57
0
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:    
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)

    if publish:

        # Create the web service proxy
        config = getConfig()
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            servicePollingDelay = config.getfloat('DEFAULT','hessian_service_polling_delay')
            spi = servicePollingIterations = config.getint('DEFAULT','hessian_service_polling_iterations')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:                   # REST service
            spi = 1
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)
Example No. 58
0
    def __init__(self,
                 url,
                 certFile,
                 certs_location,
                 keyFile=None,
                 debug=False):
        """

        Create a RESTful ESGF Publication Service proxy. The proxy
        supports both the current and legacy publication services.

        The current API is defined at:
        http://esgf.org/wiki/ESGF_Publishing_Services

        See http://esgf.org/esg-search-site/LegacyPublishingService.html
        for definition of the legacy API.

        url
          Publication service URL. For example, 'https://pcmdi9.llnl.gov/esg-search/ws/'.
          Note that the actual service operation will be appended to the URL.

        certFile
          Client certificate file in PEM format, for client certificate authentication.

        certs_location
          Location of trusted CA certificates. Note that this constructor builds its own
          concatenated bundle from the configured esg_certificates_bundle and certFile,
          and stores its path in self.certs_location.

        keyFile
          Client key file, if different from certFile.

        debug
          Boolean flag. If True, write debugging information.
        """

        self.service_type = 'REST'
        self.url = url
        if self.url[-1] != '/':
            self.url += '/'
        self.harvestUrl = urljoin(self.url, 'harvest')
        self.deleteUrl = urljoin(self.url, 'delete')
        self.retractUrl = urljoin(self.url, 'retract')

        self.certFile = certFile
        if keyFile is not None:
            self.keyFile = keyFile
        else:
            self.keyFile = certFile
        outdir = os.path.dirname(certFile)
        concat_certs = os.path.join(outdir, 'concatenatedcerts')

        # We need to concatenate the certs bundle with the client cert to use as the CA context (thanks to pchengi for the initial fix).
        # Check whether a bundle location is configured; if not, fall back to the default.

        config = getConfig()
        try:
            certs_bundle_location = config.get('DEFAULT',
                                               'esg_certificates_bundle')
        except Exception:
            certs_bundle_location = DEFAULT_CERTS_BUNDLE

        files = [certs_bundle_location, certFile]
        with open(concat_certs, 'w') as outfile:
            for certf in files:
                with open(certf, 'r') as infile:
                    outfile.write(infile.read())
                    outfile.write('\n')
        self.certs_location = concat_certs
        self.debug = debug
        self.status = 0
        self.message = ''
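
A minimal usage sketch for the constructor above; the URL and certificate paths are placeholders for illustration only, not values taken from any deployment:

# Hypothetical values; a real call would use the node's own endpoint and credentials.
service = RestPublicationService(
    'https://esgf-node.example.org/esg-search/ws/',  # publication service endpoint
    '/etc/grid-security/hostcert.pem',               # client certificate (PEM)
    '/etc/grid-security/certificates',               # certs_location argument
    keyFile='/etc/grid-security/hostkey.pem',
    debug=False)
# service.harvestUrl, service.deleteUrl and service.retractUrl now point at the
# 'harvest', 'delete' and 'retract' operations appended to the service URL.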
Example No. 59
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    If republish is False:
      Returns a status dictionary: datasetName => status.
    Otherwise:
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize the THREDDS server.

    las
      Boolean flag: if true (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, perform the delete/retract operation with the RESTful publication service.

    pid_connector
      esgfpid.Connector object used to register PIDs.

    project_config_section
      Name of the project config section in esg.ini (for user-specific project configs).

    data_node
      String, the data node to unpublish (only for unpublication from Solr).

    """
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            service_certs_location = getServiceCertsLoc()
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, service_certs_location, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                if service.service_type == 'REST':
                    error('Cannot unpublish multiple versions using REST. Please specify a single dataset version ("dataset_id#1"). Skipping %s' % datasetName)
                    continue
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName