Example #1
def checksum(path, client):
    """
    Calculate a file checksum.

    Returns the String checksum.

    path
      String pathname.

    client
      String client name. The command executed is '``client path``'. The client may be an absolute path ("/usr/bin/md5sum") or basename ("md5sum"). For a basename, the executable must be in the user's search path.
    """

    if not os.path.exists(path):
        raise ESGPublishError("No such file: %s"%path)

    command = "%s %s"%(client, path)
    info("Running: %s"%command)

    try:
        f = subprocess.Popen([client, path], stdout=subprocess.PIPE).stdout
    except:
        error("Error running command '%s %s', check configuration option 'checksum'."%command)
    lines = f.readlines()
    csum = lines[0].split()[0]

    return csum
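A hedged usage sketch for the function above; the path and checksum client are illustrative, and the call assumes the surrounding esgcet helpers (info, ESGPublishError) are importable as in the example.

# Hypothetical call: 'md5sum' must be on the PATH, or pass an absolute path such as /usr/bin/md5sum.
csum = checksum("/tmp/sample.nc", "md5sum")
print("checksum = %s" % csum)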
Example #2
def initializeExperiments(config, projectName, session):
    from esgcet.model import Experiment, Project

    projectSection = 'project:' + projectName
    experimentOption = config.get(projectSection, 'experiment_options')
    experimentSpecs = splitRecord(experimentOption)
    try:
        for projectId, experimentName, experimentDesc in experimentSpecs:
            if projectId != projectName:
                continue

            # Check if the experiment exists
            experiment = session.query(Experiment).filter_by(
                name=experimentName, project=projectName).first()
            if experiment is None:
                info("Adding experiment %s for project %s" %
                     (experimentName, projectName))
                experiment = Experiment(experimentName, projectName,
                                        experimentDesc)
                project = session.query(Project).filter_by(
                    name=projectName).first()
                if project is None:
                    raise ESGPublishError('No such project: %s' % projectName)
                project.experiments.append(experiment)
                session.add(experiment)
                session.commit()
    except ValueError:
        raise ESGPublishError(
            'experiment_options is misconfigured in section %s: %s' %
            (projectSection, experimentOption))
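A hedged sketch of how the function above is driven. Each experiment_options record must unpack into (projectId, experimentName, experimentDesc); the project and experiment names below are made up, and config/session are assumed to be an existing configuration object and database session.

# Illustrative esg.ini records (typically pipe-separated, one per line):
#   [project:myproj]
#   experiment_options =
#       myproj | historical | Simulation of the recent past
#       myproj | control    | Pre-industrial control run
initializeExperiments(config, "myproj", session)   # adds any missing Experiment rows for 'myproj'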
Example #3
    def threddsIsValidVariableFilePair(self, variable, fileobj):
        """Returns True iff the variable and file should be published
        to a per-variable THREDDS catalog for this project.

        variable
          A Variable instance.

        fileobj
          A File instance.
        """
        # Require that the variable short name match the portion
        # of the file basename preceding the first underscore.
        try:
            if self.checkFilenames:
                shortname = variable.short_name
                path = fileobj.getLocation()
                basename = os.path.basename(path)
                pathshortname = basename.split('_')[0]
                result = (shortname == pathshortname)
            else:
                result = True
        except:
            result = True
        if not result:
            info("Skipping variable %s (in file %s)"%(variable.short_name, fileobj.getLocation()))
        return result
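The check above only compares the variable short name against the leading, underscore-delimited token of the file basename. A standalone sketch of that comparison, with a hypothetical CMIP-style filename:

import os

path = "/data/tas_Amon_SomeModel_historical_r1i1p1_185001-200512.nc"   # hypothetical
shortname = "tas"
pathshortname = os.path.basename(path).split('_')[0]
print(shortname == pathshortname)   # True: 'tas' matches the filename prefix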
Example #4
def checksum(path, client):
    """
    Calculate a file checksum.

    Returns the String checksum.

    path
      String pathname.

    client
      String client name. The command executed is '``client path``'. The client may be an absolute path ("/usr/bin/md5sum") or basename ("md5sum"). For a basename, the executable must be in the user's search path.
    """

    if not os.path.exists(path):
        raise ESGPublishError("No such file: %s" % path)

    command = "%s %s" % (client, path)
    info("Running: %s" % command)

    try:
        f = subprocess.Popen([client, path], stdout=subprocess.PIPE).stdout
    except:
        error(
            "Error running command '%s', check configuration option 'checksum'."
            % command)
    lines = f.readlines()
    csum = lines[0].split()[0]

    return csum
Example #5
    def threddsIsValidVariableFilePair(self, variable, fileobj):
        """Returns True iff the variable and file should be published
        to a per-variable THREDDS catalog for this project.

        variable
          A Variable instance.

        fileobj
          A File instance.
        """
        # Require that the variable short name match the portion
        # of the file basename preceding the first underscore.
        try:
            if self.checkFilenames:
                shortname = variable.short_name
                path = fileobj.getLocation()
                basename = os.path.basename(path)
                pathshortname = basename.split('_')[0]
                result = (shortname == pathshortname)
            else:
                result = True
        except:
            result = True
        if not result:
            info("Skipping variable %s (in file %s)" %
                 (variable.short_name, fileobj.getLocation()))
        return result
Example #6
def renameFilesVersion(dset, dsetVersion, pathlist, session, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, keepVersion=False, newVersion=None, extraFields=None, **context):

    info("Renaming files in dataset: %s, version %d"%(dset.name, dsetVersion.version))

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    basedict = dset.getBaseDictionary()

    nfiles = len(pathlist)

    varlocate = configOptions['variable_locate']
    seq = 0
    for path, size in pathlist:

        # If the file exists, rename it
        oldpath = None
        if extraFields is not None:
            oldpath = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if oldpath is None:
            info("No from_file field for file %s, skipping"%path)
            continue

        if locdict.has_key(oldpath):
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            if not os.path.exists(path):
                info("File not found: %s, skipping"%path)
                continue
            info("Renaming %s to %s"%(oldpath, path))
            del basedict[fileObj.base]
            base = generateFileBase(path, basedict, dset.name)
            fileObj.base = base
            basedict[base] = 1
            fileVersionObj.location = path
            del locdict[oldpath]
            locdict[path] = fileVersionObj
        else:
            info("File entry %s not found, skipping"%oldpath)
            continue

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return False
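A hedged invocation sketch. The pathlist pairs each new location with a size, and extraFields (as produced by readDatasetMap) must supply a from_file entry mapping each new path back to its old location, otherwise the file is skipped; all object arguments are assumed to exist already.

pathlist = [("/data/new/tas_new.nc", 123456)]        # hypothetical (path, size) pairs
renameFilesVersion(dset, dsetVersion, pathlist, session,
                   cfHandler, configOptions, extraFields=extraFields)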
Example #7
def renameFilesVersion(dset, dsetVersion, pathlist, session, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, keepVersion=False, newVersion=None, extraFields=None, **context):

    info("Renaming files in dataset: %s, version %d"%(dset.name, dsetVersion.version))

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    basedict = dset.getBaseDictionary()

    nfiles = len(pathlist)

    varlocate = configOptions['variable_locate']
    seq = 0
    for path, size in pathlist:

        # If the file exists, rename it
        oldpath = None
        if extraFields is not None:
            oldpath = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if oldpath is None:
            info("No from_file field for file %s, skipping"%path)
            continue

        if locdict.has_key(oldpath):
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            if not os.path.exists(path):
                info("File not found: %s, skipping"%path)
                continue
            info("Renaming %s to %s"%(oldpath, path))
            del basedict[fileObj.base]
            base = generateFileBase(path, basedict, dset.name)
            fileObj.base = base
            basedict[base] = 1
            fileVersionObj.location = path
            del locdict[oldpath]
            locdict[path] = fileVersionObj
        else:
            info("File entry %s not found, skipping"%oldpath)
            continue

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return False
Example #8
    def __invoke(self, method, params):
        # call a method on the remote server

        request = HessianWriter().write_call(method, params)

        #----------------------------------------------------------------------
        # Patch for HTTP proxy support starts here.  [email protected]
        #
        import httplib, os, urlparse, ssl

        if self._scheme=="http":
            proxy_url = os.environ.get('http_proxy')
            if proxy_url is not None:
                if DEBUG:
                    messaging.info('Proxy detected at %s' % proxy_url)
                proxy_parts = urlparse.urlparse(proxy_url)
                proxy_host = proxy_parts.hostname
                proxy_port = proxy_parts.port
                if proxy_port is None:
                    proxy_port = 80
                h = httplib.HTTPConnection(proxy_host, port=proxy_port)
            else:
                h = httplib.HTTPConnection(self._host, port=self._port)
        else:
            ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
            h = httplib.HTTPSConnection(self._host, port=self._port, key_file=self._key_file, cert_file=self._cert_file, context=ctx)


        req_headers = {'Host': self._host,
                       'User-Agent': "hessianlib.py/%s" % __version__,
                       'Content-Length': str(len(request)),
                       }

        if DEBUG:
            messaging.info('Sending request: %s'%`request`)
        h.request("POST", self._url, request, req_headers)
        #
        # End Patch from [email protected]
        #----------------------------------------------------------------------

        response = h.getresponse()
        headers = response.getheaders()
        errcode = response.status
        errmsg = response.reason
        # errcode, errmsg, headers = h.getreply()

        if errcode != 200:
            raise ProtocolError(self._url, errcode, errmsg, headers)

        # return self.parse_response(h.getfile())
        if DEBUG:
            messaging.info('Got response:')
        responseProxy = ResponseProxy(response)
        return self.parse_response(responseProxy)
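The proxy branch above keys off the http_proxy environment variable, using only the host and port of that URL. A small standalone sketch of that parsing (Python 2, matching the code above; the proxy URL is made up):

import urlparse   # urllib.parse on Python 3

proxy_url = "http://proxy.example.org:3128"    # hypothetical http_proxy value
proxy_parts = urlparse.urlparse(proxy_url)
print(proxy_parts.hostname)                    # 'proxy.example.org'
print(proxy_parts.port or 80)                  # 3128; the code falls back to port 80 if none is given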
Example #9
def calc_checksum(filepath, checksum_client):
    csum = None
    if os.path.exists(filepath):
        command = "%s %s"%(checksum_client, filepath)
        info("Running: %s"%command)
        try:
            f = os.popen(command).read()
            csum = f.split(' ')[0]
        except:
            error("Error running command '%s', check configuration option 'checksum'."%command)

    return filepath, csum
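Unlike checksum() above, calc_checksum() shells out via os.popen, does not raise for a missing file, and returns a (filepath, checksum) pair with the checksum left as None on failure. A hedged usage sketch with made-up arguments:

filepath, csum = calc_checksum("/tmp/sample.nc", "sha256sum")
if csum is None:
    print("checksum could not be computed for %s" % filepath)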
Example #10
def deleteGatewayDatasetVersion(versionName,
                                gatewayOperation,
                                service,
                                session,
                                dset=None):
    """
    Delete a dataset version from the gateway.

    Returns (*event_name*, *state_name*) where *event_name* is the
    associated event, such as ``esgcet.model.DELETE_GATEWAY_DATASET_EVENT``,
    and *state_name* is 'SUCCESSFUL' or 'UNSUCCESSFUL'

    versionName
      String dataset identifier (foo.bar.vN).

    gatewayOperation
      DELETE or UNPUBLISH

    service
      Hessian proxy web service

    session
      A database Session **instance**.

    dset
      Parent dataset of the version. If None, don't record the deletion event.

    """

    # Clear publication errors from dataset_status
    if dset is not None:
        dset.clear_warnings(session, PUBLISH_MODULE)

    if gatewayOperation == DELETE:
        successEvent = DELETE_GATEWAY_DATASET_EVENT
        failureEvent = DELETE_DATASET_FAILED_EVENT
    else:
        successEvent = UNPUBLISH_GATEWAY_DATASET_EVENT
        failureEvent = UNPUBLISH_DATASET_FAILED_EVENT

    # Delete
    try:
        if gatewayOperation == DELETE:
            info("Deleting %s" % versionName)
            service.deleteDataset(versionName, True, 'Deleting dataset')
        else:
            info("Retracting %s" % versionName)
            service.retractDataset(versionName, 'Retracting dataset')
    except socket.error, e:
        raise ESGPublishError(
            "Socket error: %s\nIs the proxy certificate %s valid?" %
            (`e`, service._cert_file))
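A hedged invocation sketch based on the docstring above; the version identifier is a placeholder, and service/session are assumed to be an open Hessian proxy and database session.

# DELETE removes the dataset version from the gateway; UNPUBLISH only retracts it.
eventName, stateName = deleteGatewayDatasetVersion(
    "myproj.mydata.v1",            # hypothetical dataset version id (foo.bar.vN)
    DELETE, service, session, dset=dset)
print(stateName)                   # 'SUCCESSFUL' or 'UNSUCCESSFUL'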
Example #11
    def evt_save_dataset_page(self, datasetName):
        import types
        from esgcet.messaging import debug, info
        new_settings = {}
        for x in self.field_list.keys():
            field_value = self.field_list[ x ].get()
            if type(field_value) == types.UnicodeType:
                field_value = str( field_value )[:-1]
            new_settings[ x ] = field_value

        del new_settings[ 'id' ]    # Don't reset the 'id'
        updateDatasetFromContext(new_settings, datasetName, self.Session)

        info("The changes have been accepted for the dataset: %s." % datasetName)
Example #12
def registerHandlers(project=None):
    """Read the project handlers from the init file handler parameters, and add to the registry."""

    # Get the project names
    projectOption = config.get('initialize', 'project_options')
    projectSpecs = splitRecord(projectOption)
    projectRegistry = getRegistry(ESGCET_PROJECT_HANDLER_GROUP)
    formatRegistry = getRegistry(ESGCET_FORMAT_HANDLER_GROUP)
    metadataRegistry = getRegistry(ESGCET_METADATA_HANDLER_GROUP)
    threddsRegistry = getRegistry(ESGCET_THREDDS_CATALOG_HOOK_GROUP)

    for projectName, projectDesc, search_order in projectSpecs:
        # process only the given project
        if (project is not None) and (projectName != project):
            continue

        # For each project: get the handler
        handler = config.get('project:'+projectName, HANDLER_OPTION, default=None)
        handlerName = config.get('project:'+projectName, PROJECT_NAME_OPTION, default=None)

        # Get the handler class and register it
        if handlerName is not None:
            registerHandlerName(projectRegistry, projectName, handlerName)
            setRegisterSearchOrder(projectName, search_order)
        elif handler is not None:
            m, cls = handler.split(':')
            register(projectRegistry, projectName, m, cls)
            setRegisterSearchOrder(projectName, search_order)
        else:
            info("No project handler spec found for project %s"%projectName)

        # Get the format handler class and register it
        formatHandlerName = config.get('project:'+projectName, FORMAT_NAME_OPTION, default=None)
        if formatHandlerName is not None:
            registerHandlerName(formatRegistry, projectName, formatHandlerName)
        else:
            registerHandlerName(formatRegistry, projectName, DEFAULT_FORMAT_NAME_OPTION)

        # Get the metadata handler class and register it
        metadataHandlerName = config.get('project:'+projectName, METADATA_NAME_OPTION, default=None)
        if metadataHandlerName is not None:
            registerHandlerName(metadataRegistry, projectName, metadataHandlerName)
        else:
            registerHandlerName(metadataRegistry, projectName, DEFAULT_METADATA_NAME_OPTION)

        # Get the thredds catalog hook if any
        threddsCatalogHookName = config.get('project:'+projectName, THREDDS_CATALOG_HOOK_OPTION, default=None)
        if threddsCatalogHookName is not None:
            registerHandlerName(threddsRegistry, projectName, threddsCatalogHookName)
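The registrations above are driven entirely by esg.ini: project_options supplies (name, description, search_order) records, and each project:NAME section may name handler classes as module:class. A hedged illustration; the option names follow the constants used in the code, and the project and class names are hypothetical.

# [initialize]
# project_options =
#     myproj | My Example Project | 1
#
# [project:myproj]
# handler = esgcet.config.myproj_handler:MyProjHandler
#
# With such a configuration loaded, register only that project:
registerHandlers(project="myproj")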
Example #13
    def evt_save_dataset_page(self, datasetName):
        import types
        from esgcet.messaging import debug, info
        new_settings = {}
        for x in self.field_list.keys():
            field_value = self.field_list[x].get()
            if type(field_value) == types.UnicodeType:
                field_value = str(field_value)[:-1]
            new_settings[x] = field_value

        del new_settings['id']  # Don't reset the 'id'
        updateDatasetFromContext(new_settings, datasetName, self.Session)

        info("The changes have been accepted for the dataset: %s." %
             datasetName)
Example #14
def deleteGatewayDatasetVersion(versionName, gatewayOperation, service, session, dset=None):
    """
    Delete a dataset version from the gateway.

    Returns (*event_name*, *state_name*) where *event_name* is the
    associated event, such as ``esgcet.model.DELETE_GATEWAY_DATASET_EVENT``,
    and *state_name* is 'SUCCESSFUL' or 'UNSUCCESSFUL'

    versionName
      String dataset identifier (foo.bar.vN).

    gatewayOperation
      DELETE or UNPUBLISH

    service
      Hessian proxy web service

    session
      A database Session **instance**.

    dset
      Parent dataset of the version. If None, don't record the deletion event.

    """

    # Clear publication errors from dataset_status
    if dset is not None:
        dset.clear_warnings(session, PUBLISH_MODULE)

    if gatewayOperation==DELETE:
        successEvent = DELETE_GATEWAY_DATASET_EVENT
        failureEvent = DELETE_DATASET_FAILED_EVENT
    else:
        successEvent = UNPUBLISH_GATEWAY_DATASET_EVENT
        failureEvent = UNPUBLISH_DATASET_FAILED_EVENT

    # Delete
    try:
        if gatewayOperation==DELETE:
            info("Deleting %s"%versionName)
            service.deleteDataset(versionName, True, 'Deleting dataset')
        else:
            info("Retracting %s"%versionName)
            service.retractDataset(versionName, 'Retracting dataset')
    except socket.error, e:
        raise ESGPublishError("Socket error: %s\nIs the proxy certificate %s valid?"%(`e`, service._cert_file))
Example #15
def reinitializeLAS():
    """
    Reinitialize the Live Access Server. This forces the catalogs to be reread.

    Returns the HTML string returned from the URL.

    """
    config = getConfig()
    if config is None:
        raise ESGPublishError("No configuration file found.")

    lasReinitUrl = config.get("DEFAULT", "las_reinit_url")
    info("Reinitializing LAS server")

    try:
        reinitResult = readThreddsWithAuthentication(lasReinitUrl, config)
    except Exception, e:
        raise ESGPublishError("Error reinitializing the Live Access Server: %s" % e)
Example #16
def registerHandlers():
    """Read the project handlers from the init file handler parameters, and add to the registry."""
    # Get the project names
    projectOption = config.get('initialize', 'project_options')
    projectSpecs = splitRecord(projectOption)
    projectRegistry = getRegistry(ESGCET_PROJECT_HANDLER_GROUP)
    formatRegistry = getRegistry(ESGCET_FORMAT_HANDLER_GROUP)
    metadataRegistry = getRegistry(ESGCET_METADATA_HANDLER_GROUP)
    threddsRegistry = getRegistry(ESGCET_THREDDS_CATALOG_HOOK_GROUP)
    for projectName, projectDesc, search_order in projectSpecs:

        # For each project: get the handler
        handler = config.get('project:'+projectName, HANDLER_OPTION, default=None)
        handlerName = config.get('project:'+projectName, PROJECT_NAME_OPTION, default=None)
        
        # Get the handler class and register it
        if handlerName is not None:
            registerHandlerName(projectRegistry, projectName, handlerName)
            setRegisterSearchOrder(projectName, search_order)
        elif handler is not None:
            m, cls = handler.split(':')
            register(projectRegistry, projectName, m, cls)
            setRegisterSearchOrder(projectName, search_order)
        else:
            info("No project handler spec found for project %s"%projectName)

        # Get the format handler class and register it
        formatHandlerName = config.get('project:'+projectName, FORMAT_NAME_OPTION, default=None)
        if formatHandlerName is not None:
            registerHandlerName(formatRegistry, projectName, formatHandlerName)
        else:
            registerHandlerName(formatRegistry, projectName, DEFAULT_FORMAT_NAME_OPTION)

        # Get the metadata handler class and register it
        metadataHandlerName = config.get('project:'+projectName, METADATA_NAME_OPTION, default=None)
        if metadataHandlerName is not None:
            registerHandlerName(metadataRegistry, projectName, metadataHandlerName)
        else:
            registerHandlerName(metadataRegistry, projectName, DEFAULT_METADATA_NAME_OPTION)

        # Get the thredds catalog hook if any
        threddsCatalogHookName = config.get('project:'+projectName, THREDDS_CATALOG_HOOK_OPTION, default=None)
        if threddsCatalogHookName is not None:
            registerHandlerName(threddsRegistry, projectName, threddsCatalogHookName)
Example #17
def reinitializeLAS():
    """
    Reinitialize the Live Access Server. This forces the catalogs to be reread.

    Returns the HTML string returned from the URL.

    """
    config = getConfig()
    if config is None:
        raise ESGPublishError("No configuration file found.")

    lasReinitUrl = config.get('DEFAULT', 'las_reinit_url')
    info("Reinitializing LAS server")

    try:
        reinitResult = readThreddsWithAuthentication(lasReinitUrl, config)
    except Exception, e:
        raise ESGPublishError(
            "Error reinitializing the Live Access Server: %s" % e)
Example #18
def validateDRSFieldValues(context, cdfile):
    """DRS fields must be formed from characters a-z,A-Z,0-9,-
    
    context: dictionary of context values to be validated
    cdfile: CdunifFormatHandler instance

    Returns the context with any sequence of invalid characters (for a DRS field) mapped to '-'.
    
    For example, 'NOAA  GFDL' is mapped to 'NOAA-GFDL'.
    """

    for key in context.keys():
        if isDRSField(key):
            value = context[key]
            if drsInvalidValues.search(value) is not None:
                result = drsInvalidValues.sub('-', value)
                info('Mapped invalid %s value: %s to %s, file: %s'%(key, value, result, cdfile.path))
                context[key] = result

    return context
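The substitution above relies on a module-level drsInvalidValues pattern that is not shown in this excerpt. A plausible stand-in consistent with the docstring (any run of characters outside a-z, A-Z, 0-9 and '-' collapses to a single '-'); the real module may define it differently:

import re

drsInvalidValues = re.compile(r'[^a-zA-Z0-9-]+')     # assumed shape of the pattern
print(drsInvalidValues.sub('-', 'NOAA  GFDL'))       # -> 'NOAA-GFDL', as in the docstring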
Example #19
def deleteFilesVersion(
    dset,
    dsetVersion,
    pathlist,
    session,
    cfHandler,
    configOptions,
    aggregateDimensionName=None,
    offline=False,
    progressCallback=None,
    stopEvent=None,
    extraFields=None,
    **context
):

    info("Deleting file entries for dataset: %s, version %d" % (dset.name, dsetVersion.version))

    haveLatestDsetVersion = dsetVersion.version == dset.getVersion()

    # Create a file dictionary for the dataset
    fobjdict = {}  # file version objects for the new dataset version
    for fobj in dsetVersion.getFileVersions():
        fobjdict[fobj.location] = fobj

    nfiles = len(pathlist)

    varlocate = configOptions["variable_locate"]
    seq = 0
    addNewDatasetVersion = False
    for path, size in pathlist:

        # If the file exists in the dataset, delete the file children (with cascade), and the file
        if fobjdict.has_key(path):
            fileVersionObj = fobjdict[path]
            info("Deleting entry for file %s" % path)

            # If this is the latest dataset version, remove the file variables and reaggregate ...
            if haveLatestDsetVersion:
                fileVersionObj.parent.deleteChildren(session)
                addNewDatasetVersion = True

            # ... otherwise just delete the membership of the file version in the dataset version
            else:
                fileVersionObj.deleteChildren(session)
                session.commit()
            del fobjdict[path]
        else:
            info("File entry not found: %s, skipping" % path)

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return addNewDatasetVersion, fobjdict.values()
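A hedged invocation sketch: pathlist pairs each path with a size, and the two return values indicate whether a new dataset version should be created and which file version objects remain attached to it; all object arguments are assumed to exist already.

pathlist = [("/data/old/tas_old.nc", 987654)]    # hypothetical (path, size) pairs
addNewVersion, remainingFileVersions = deleteFilesVersion(
    dset, dsetVersion, pathlist, session, cfHandler, configOptions)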
Example #20
def initializeExperiments(config, projectName, session):
    from esgcet.model import Experiment, Project

    projectSection = 'project:'+projectName
    experimentOption = config.get(projectSection, 'experiment_options')
    experimentSpecs = splitRecord(experimentOption)
    try:
        for projectId, experimentName, experimentDesc in experimentSpecs:
            if projectId != projectName:
                continue

            # Check if the experiment exists
            experiment = session.query(Experiment).filter_by(name=experimentName, project=projectName).first()
            if experiment is None:
                info("Adding experiment %s for project %s"%(experimentName, projectName))
                experiment = Experiment(experimentName, projectName, experimentDesc)
                project = session.query(Project).filter_by(name=projectName).first()
                if project is None:
                    raise ESGPublishError('No such project: %s'%projectName)
                project.experiments.append(experiment)
                session.add(experiment)
                session.commit()
    except ValueError:
        raise ESGPublishError('experiment_options is misconfigured in section %s: %s'%(projectSection, experimentOption))
Example #21
def deleteFilesVersion(dset, dsetVersion, pathlist, session, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, **context):

    info("Deleting file entries for dataset: %s, version %d"%(dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Create a file dictionary for the dataset
    fobjdict = {}                       # file version objects for the new dataset version
    for fobj in dsetVersion.getFileVersions():
        fobjdict[fobj.location] = fobj

    nfiles = len(pathlist)

    varlocate = configOptions['variable_locate']
    seq = 0
    addNewDatasetVersion = False
    for path, size in pathlist:

        # If the file exists in the dataset, delete the file children (with cascade), and the file
        if fobjdict.has_key(path):
            fileVersionObj = fobjdict[path]
            info("Deleting entry for file %s"%path)

            # If this is the latest dataset version, remove the file variables and reaggregate ...
            if haveLatestDsetVersion:
                fileVersionObj.parent.deleteChildren(session)
                addNewDatasetVersion = True

            # ... otherwise just delete the membership of the file version in the dataset version
            else:
                fileVersionObj.deleteChildren(session)
                session.commit()
            del fobjdict[path]
        else:
            info("File entry not found: %s, skipping"%path)

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return addNewDatasetVersion, fobjdict.values()
Example #22
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None, offline=False, operation=CREATE_OP, progressCallback=None, stopEvent=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, useVersion=-1, forceRescan=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, and add it to the database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    fileIterator
      An iterator that returns an iteration of (file_path, file_size), where file_size is an integer.

    dbSession
      A database Session.

    handler
      Project handler

    cfHandler  
      A CF handler instance

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    offline
      Boolean, True if the files are offline, cannot be scanned.

    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.

    useVersion
      Integer version number of the dataset version to modify (default -1, meaning the latest version is modified).

    forceRescan
      Boolean, if True force all files to be rescanned on an update.

    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields needed to uniquely define the dataset.

    """

    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s'%context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None

        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None

        versionByDate = config.getboolean(section, 'version_by_date', default=False)
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False
        
    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation==CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s"%datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset."%dset.name)

    createTime = datetime.datetime.now() # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if operation==CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s"%datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context)
        
    elif operation in [UPDATE_OP, REPLACE_OP]:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, **context)
         
    elif operation==RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
         
    elif operation==DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s"%`operation`)

    # Create a new dataset version if necessary
    if keepVersion:
        if existingVersion<=0:
            newVersion = getInitialDatasetVersion(versionByDate)
        else:
            newVersion = existingVersion
    elif newVersion is None:
        newVersion = getNextDatasetVersion(existingVersion, versionByDate)
        
    dset.reaggregate = False
    # Add a new version
    if addNewVersion and newVersion>existingVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title
        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes, tech_notes_title=datasetTechNotesTitle)
        info("New dataset version = %d"%newDsetVersionObj.version)
        for var in dset.variables:
            session.delete(var)
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    # Keep the current (latest) version
    elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d"%versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    elif masterGateway is not None:     # Force version set on replication
        info("Dataset version = %d"%newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
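A hedged sketch of a minimal call, passing only the documented required arguments plus a context; the dataset name, file list, and context values are placeholders, and handler/cfHandler are assumed to come from the handler registry.

fileIterator = [("/data/myproj/tas_2000.nc", 123456)]   # hypothetical (path, size) pairs
dset = extractFromDataset("myproj.mydata", fileIterator, Session, handler, cfHandler,
                          aggregateDimensionName="time", operation=CREATE_OP,
                          project="myproj", model="mymodel", experiment="historical")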
Example #23
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True, las=False, progressCallback=None,
                       service=None, perVariable=None, threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False, restInterface=False,
                       schema=None, pid_connector=None, project_config_section=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status
    
    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is
      set to the string for all datasets being published. This function
      can be overridden in the project handler to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true, reinitialize the LAS server (default False).

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to value of thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary. 
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    """

    session = Session()
    resultDict = {}
    if readFromCatalog and threddsCatalogDictionary is None:
        raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s"%datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno==-1:
                versionno=latestVersion
            if versionno!=latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno,latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno,
                                pid_connector=pid_connector)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName,versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s"%threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno,
                                pid_connector=pid_connector)
                threddsCatalogDictionary[(datasetName,versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:    
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)
Example #24
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "hi:", [
            'database-delete', 'database-only', 'echo-sql', 'map=',
            'no-republish', 'no-thredds-reinit', 'skip-gateway', 'skip-index',
            'las', 'log=', 'rest-api', 'skip-thredds', 'sync-thredds',
            'use-list='
        ])
    except getopt.error:
        print sys.exc_value
        return

    deleteAll = False
    datasetMap = None
    deleteDset = False
    unpublishOnGateway = False
    echoSql = False
    init_file = None
    gatewayOp = DELETE
    las = False
    log_filename = None
    republish = True
    restApi = None
    thredds = True
    syncThredds = False
    useList = False
    threddsReinit = True
    for flag, arg in args:
        if flag == '--database-delete':
            deleteDset = True
        elif flag == '--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag == '--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            return
        elif flag == '-i':
            init_file = arg
        elif flag == '--map':
            datasetMap = readDatasetMap(arg)
        elif flag == '--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag == '--skip-index':
            gatewayOp = NO_OPERATION
        elif flag == '--las':
            las = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--no-republish':
            republish = False
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--skip-thredds':
            thredds = False
        elif flag == '--sync-thredds':
            syncThredds = True
        elif flag == '--use-list':
            useList = True
            useListPath = arg

    if gatewayOp != NO_OPERATION and unpublishOnGateway:
        gatewayOp = UNPUBLISH

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'),
                           echo=echoSql,
                           pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    if config is None:
        raise ESGPublishError("No configuration file found.")
    threddsRoot = config.get('DEFAULT', 'thredds_root')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    if datasetMap is None:
        if not useList:
            datasetNames = [parseDatasetVersionId(item) for item in lastargs]
        else:
            if useListPath == '-':
                namelist = sys.stdin
            else:
                namelist = open(useListPath)
            datasetNames = []
            for line in namelist.readlines():
                versionId = parseDatasetVersionId(line.strip())
                datasetNames.append(versionId)
    else:
        datasetNames = datasetMap.keys()
        datasetNames.sort()
    result = deleteDatasetList(datasetNames,
                               Session,
                               gatewayOp,
                               thredds,
                               las,
                               deleteDset,
                               deleteAll=deleteAll,
                               republish=republish,
                               reinitThredds=threddsReinit,
                               restInterface=restApi)

    # Republish previous versions as needed. This will happen if the latest version
    # was deleted from the database, and is not
    # the only version. In this case the previous version will be rescanned to generate the aggregations.
    if republish:
        statusDict, republishList = result
        if len(republishList) > 0:

            # Register project handlers.
            registerHandlers()

            info("Republishing modified datasets:")
            republishDatasetNames = [
                generateDatasetVersionId(dsetTuple)
                for dsetTuple in republishList
            ]
            dmap, offline = queryDatasetMap(republishDatasetNames, Session)
            datasetNames = dmap.keys()
            datasets = iterateOverDatasets(None,
                                           dmap,
                                           None,
                                           republishList,
                                           Session,
                                           "time",
                                           UPDATE_OP,
                                           None, {},
                                           offline, {},
                                           forceAggregate=True)
            republishOp = (gatewayOp != NO_OPERATION
                           )  # Don't republish if skipping the gateway op
            result = publishDatasetList(datasetNames,
                                        Session,
                                        publish=republishOp,
                                        thredds=thredds)

    # Synchronize database and THREDDS catalogs
    if syncThredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')

        # Make a dictionary of catalogs from the database
        session = Session()
        subcatalogs = session.query(Catalog).select_from(
            join(Catalog, Dataset,
                 Catalog.dataset_name == Dataset.name)).all()
        catdict = {}
        for catalog in subcatalogs:
            location = os.path.join(threddsRoot, catalog.location)
            catdict[location] = 1
        session.close()

        # Scan all XML files in the threddsroot
        os.path.walk(threddsRoot, cleanupCatalogs, catdict)
Example #25
    def validateFile(self, fileobj):
        """
        For CMIP6, this first verifies whether the data was written by CMOR at or above the minimum version set in the ini file.
        If so, the file is declared valid. If not, the file goes through the PrePARE (CV) check; PrePARE also runs CFChecker.

        Raises ESGPublishError if settings are missing or the file fails the checks.
        Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        # todo refactoring these could loaded upfront in the constructor
        config = getConfig()
        project_section = 'project:' + self.name
        project_config_section = 'config:' + self.name
        min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0")
        min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0")
        data_specs_version = config.get(project_config_section, "data_specs_version", default="master")
        cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH)
        force_validation = config.getboolean(project_config_section, "force_validation", default=False)
        cmor_table_subdirs = config.getboolean(project_config_section, "cmor_table_subdirs", default=False)

        if not force_validation:

            if self.replica:
                info("skipping PrePARE for replica (file %s)" % f)
                return

            try:
                file_cmor_version = fileobj.getAttribute('cmor_version', None)
            except:
                file_cmor_version = None
                debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

            passed_cmor = False
            if compareLibVersions(min_cmor_version, file_cmor_version):
                debug('File %s cmor-ized at version %s, passed!'%(f, file_cmor_version))
                passed_cmor = True

        try:
            table = fileobj.getAttribute('table_id', None)
        except:
            raise ESGPublishError("File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)
        except:
            raise ESGPublishError("File %s missing required variable_id global attribute" % f)

        # data_specs_version drives CMOR table fetching
        # Behavior A (default): fetches the "master" branch (if "data_specs_version" is not set in esg.ini)
        # Behavior B: fetches the branch specified by "data_specs_version=my_branch" in esg.ini
        # Behavior C: fetches the branch specified by the file global attributes, via "data_specs_version=file" in esg.ini

        try:
            file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
        except Exception as e:
            raise ESGPublishError("File %s missing required data_specs_version global attribute"%f)

        if not compareLibVersions(min_ds_version, file_data_specs_version):
            raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s"%(f,file_data_specs_version,min_ds_version))
        # at this point the file has the correct data specs version.
        # if also was CMORized and has the correct version tag, we can exit

        if (not force_validation) and passed_cmor:
            return
            
        if data_specs_version == "file":
            data_specs_version = file_data_specs_version

        table_dir = getTableDir(cmor_table_path, data_specs_version, cmor_table_subdirs)
        debug("Validating {} using tables dir: {}".format(f, table_dir))

        try:
            process = validator.checkCMIP6(table_dir)
            if process is None:
                raise ESGPublishError("File %s failed the CV check - object create failure"%f)
            process.ControlVocab(f)
        except:
            raise ESGPublishError("File %s failed the CV check"%f)
Example #26
    def read(self, len):
        result = self._response.read(len)
        if DEBUG:
            messaging.info(`result`)
        return result
Example #27
    def evt_refresh_list_of_datasets(self, selected_page):

        # Start the busy routine to indicate to the users something is happening
        self.parent.parent.busyCursor = "watch"
        self.parent.parent.busyWidgets = [
            self.parent.parent.pane2.pane("EditPaneTop"),
            self.parent.parent.pane2.pane("EditPaneBottom"),
            self.parent.parent.pane2.pane("EditPaneStatus"),
            self.parent.parent.pane.pane("ControlPane"),
        ]
        pub_busy.busyStart(self.parent.parent)

        try:
            if self.parent.parent.refreshButton[selected_page].cget("relief") == "raised":
                for x in self.parent.parent.main_frame.top_page_id[selected_page]:
                    if self.parent.parent.main_frame.top_page_id[selected_page][x].cget("relief") == "raised":
                        dsetVersionName = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget("text")

                        # ganz added this 1/18/11
                        query_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget("text")
                        versionNum = self.parent.parent.main_frame.version_label[selected_page][x].cget("text")
                        #####################################################################################

                        query_name, versionNum = parseDatasetVersionId(dsetVersionName)
                        # ganz TODO test only remove
                        #                  print query_name
                        #                  print versionNum

                        status = pollDatasetPublicationStatus(query_name, self.Session)
                        # ganz catch non selected Red entries to skip 3/20/2011
                        try:
                            self.parent.parent.main_frame.status_label[selected_page][x].configure(
                                text=pub_controls.return_status_text(status)
                            )
                        except:
                            continue

                        # Make sure you update the Ok/Err button
                        # ganz added this (1/18/11) here to catch the case when dset=None (e.g. no local db entry exists)
                        dset = Dataset.lookup(query_name, self.Session)
                        if dset == None:
                            buttonColor = "yellow"
                            buttonText = "Warning"
                            self.parent.parent.main_frame.ok_err[selected_page][x].configure(
                                bg=buttonColor, text=buttonText
                            )
                        elif dset.has_warnings(self.Session):
                            warningLevel = dset.get_max_warning_level(self.Session)
                            if warningLevel >= ERROR_LEVEL:
                                buttonColor = "pink"
                                buttonText = "Error"
                            else:
                                buttonColor = "yellow"
                                buttonText = "Warning"
                            self.parent.parent.main_frame.ok_err[selected_page][x].configure(
                                bg=buttonColor, text=buttonText
                            )
        except:
            pub_busy.busyEnd(self.parent.parent)  # catch here in order to turn off the busy cursor ganz
            raise
        finally:
            pub_busy.busyEnd(self.parent.parent)
        #  pub_busy.busyEnd( self.parent.parent )
        info("Completed refreshing the display.")
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs)==0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None
    
    for flag, arg in args:
        if flag=='-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag=='--dataset':
            datasetName = arg
        elif flag=='--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag=='--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag=='--offline':
            offline = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--service':
            service = arg
        elif flag=='--use-version-dir':
            version_dir = True
        elif flag=='--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg
    
    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesTitle is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])

                for md in mapfile_md[fpath][1:]:
                    mapfile_line+=' | %s'%md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ( (appendMap is None) or (not appendMap.has_key(ds_id)) or ((fpath, "%d"% mapfile_md[fpath][0]) not in appendMap[ds_id]) ):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
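
calc_checksum_wrapper() is handed to ThreadPool.map() above but is not defined in this excerpt. A minimal sketch, assuming it unpacks the (filepath, client) tuple built in main() and returns (filepath, checksum) with None signalling failure, which is what the error handling above expects:

def calc_checksum_wrapper(args):
    # args is a (filepath, checksumClient) tuple.
    filepath, client = args
    try:
        return (filepath, checksum(filepath, client))
    except Exception:
        return (filepath, None)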
Example #29
0
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        results = []
        lenresults = len(datasetNames)
        n = spi * lenresults
        j = 0
        for datasetName,versionno in datasetNames:
            if parentId is None:
                parentIdent = handler.getParentId(datasetName)
            elif type(parentId)==type({}):
                parentIdent = parentId[datasetName]
            else:
                parentIdent = parentId
            messaging.info("Publishing: %s"%datasetName)
            dset, statusId, state, evname, status = publishDataset(datasetName, parentIdent, service, threddsRootURL, session, schema=schema, version=versionno)
            messaging.info("  Result: %s"%status.getStateItem())
            results.append((dset, statusId, state))
            resultDict[(datasetName,versionno)] = evname

            # Poll each dataset again
            j += 1
            if state not in (PublicationState.PROCESSING, PublicationState.SUCCESSFUL):
                issueCallback(progressCallback, j*spi, n, 0, 1)
                continue

            for i in range(spi):
                if state==PublicationState.SUCCESSFUL:
                    evname = PUBLISH_DATASET_EVENT
                    event = Event(dset.name, dset.getVersion(), evname)
Example #30
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    """

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if (not DELETE_AT_DATASET_LEVEL) and (dset is not None):
                for versionObj in versionObjs:
                    try:
                        eventName, stateName = deleteGatewayDatasetVersion(versionObj.name, gatewayOperation, service, session, dset=dset)
                    except RemoteCallException, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                        continue
                    except ESGPublishError, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                        continue
                    info("  Result: %s"%stateName)
                    resultDict[datasetName] = eventName
            else:                       # Nothing in the node database, but still try to delete on the gateway
                if DELETE_AT_DATASET_LEVEL and (dset is not None) and (not restInterface):
                    datasetName = dset.name
                try:
                    eventName, stateName = deleteGatewayDatasetVersion(datasetName, gatewayOperation, service, session, dset=dset)
                except RemoteCallException, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                    continue
                except ESGPublishError, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                    continue
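
For orientation, a hypothetical call to deleteDatasetList() might look like the following; the dataset identifier is invented, and Session and UNPUBLISH are assumed to come from the surrounding esgcet setup:

datasetNames = [('my.project.institute.model.experiment', 1)]   # (dataset_name, version)
statusDict = deleteDatasetList(datasetNames, Session,
                               gatewayOperation=UNPUBLISH,
                               thredds=True,
                               deleteInDatabase=False)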
Example #31
0
    def __invoke(self, method, params):
        # call a method on the remote server

        request = HessianWriter().write_call(method, params)

        # ----------------------------------------------------------------------
        # Patch for HTTP proxy support starts here.  [email protected]
        #
        import httplib, os, urlparse, ssl

        if self._scheme == "http":
            proxy_url = os.environ.get('http_proxy')
            if proxy_url is not None:
                if DEBUG:
                    messaging.info('Proxy detected at %s' % proxy_url)
                proxy_parts = urlparse.urlparse(proxy_url)
                proxy_host = proxy_parts.hostname
                proxy_port = proxy_parts.port
                if proxy_port is None:
                    proxy_port = 80
                h = httplib.HTTPConnection(proxy_host, port=proxy_port)
            else:
                h = httplib.HTTPConnection(self._host, port=self._port)
        else:
            ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
            conn_args = {'port' : self._port,
                         'key_file' : self._key_file,
                         'cert_file': self._cert_file,
                         'context': ctx}
            h = httplib.HTTPSConnection(self._host, **conn_args)

            # test the connection - may need unverified with test index nodes
            # (hopefully not with operational nodes)
            try:
                h.request("HEAD", "/")
                h.getresponse()
            except ssl.SSLError:
                messaging.warning('SSL error - disabling SSL verification')
                conn_args['context'] = ssl._create_unverified_context()
                h = httplib.HTTPSConnection(self._host, **conn_args)

        req_headers = {'Host': self._host,
                       'User-Agent': "hessianlib.py/%s" % __version__,
                       'Content-Length': str(len(request)),
                       }

        if DEBUG:
            messaging.info('Sending request: %s' % `request`)
        h.request("POST", self._url, request, req_headers)
        #
        # End Patch from [email protected]
        # ----------------------------------------------------------------------

        response = h.getresponse()
        headers = response.getheaders()
        errcode = response.status
        errmsg = response.reason
        # errcode, errmsg, headers = h.getreply()

        if errcode != 200:
            raise ProtocolError(self._url, errcode, errmsg, headers)

        # return self.parse_response(h.getfile())
        if DEBUG:
            messaging.info('Got response:')
        responseProxy = ResponseProxy(response)
        return self.parse_response(responseProxy)
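
The proxy branch above is only taken for plain-HTTP endpoints and only when the http_proxy environment variable is set. A small illustrative sketch (the proxy URL is made up):

import os
os.environ['http_proxy'] = 'http://proxy.example.org:3128'   # hypothetical proxy
# Subsequent __invoke() calls over http are routed through this proxy;
# https calls skip the proxy branch and use the SSL context built above.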
Example #32
0
def deleteDatasetList(datasetNames,
                      Session,
                      gatewayOperation=UNPUBLISH,
                      thredds=True,
                      las=False,
                      deleteInDatabase=False,
                      progressCallback=None,
                      deleteAll=False,
                      republish=False,
                      restInterface=False):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    """

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d" %
                              gatewayOperation)
    deleteOnGateway = (gatewayOperation == DELETE)
    operation = (gatewayOperation != NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName, version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(
            datasetName,
            version,
            session,
            deleteAll=deleteAll,
            restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s" % datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT', 'hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT',
                                             'hessian_service_debug')
            service = Hessian(serviceURL,
                              servicePort,
                              key_file=serviceKeyfile,
                              cert_file=serviceCertfile,
                              debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT',
                                             'rest_service_debug',
                                             default=False)
            service = RestPublicationService(serviceURL,
                                             serviceCertfile,
                                             keyFile=serviceKeyfile,
                                             debug=serviceDebug)

        for datasetName, version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if (not DELETE_AT_DATASET_LEVEL) and (dset is not None):
                for versionObj in versionObjs:
                    try:
                        eventName, stateName = deleteGatewayDatasetVersion(
                            versionObj.name,
                            gatewayOperation,
                            service,
                            session,
                            dset=dset)
                    except RemoteCallException, e:
                        fields = ` e `.split('\n')
                        error(
                            "Deletion/retraction failed for dataset/version %s with message: %s"
                            % (datasetName, string.join(fields[0:2], '\n')))
                        continue
                    except ESGPublishError, e:
                        fields = ` e `.split('\n')
                        error(
                            "Deletion/retraction failed for dataset/version %s with message: %s"
                            % (datasetName, string.join(fields[-2:], '\n')))
                        continue
                    info("  Result: %s" % stateName)
                    resultDict[datasetName] = eventName
            else:  # Nothing in the node database, but still try to delete on the gateway
                if DELETE_AT_DATASET_LEVEL and (dset is not None) and (
                        not restInterface):
                    datasetName = dset.name
                try:
                    eventName, stateName = deleteGatewayDatasetVersion(
                        datasetName,
                        gatewayOperation,
                        service,
                        session,
                        dset=dset)
                except RemoteCallException, e:
                    fields = ` e `.split('\n')
                    error(
                        "Deletion/retraction failed for dataset/version %s with message: %s"
                        % (datasetName, string.join(fields[0:2], '\n')))
                    continue
                except ESGPublishError, e:
                    fields = ` e `.split('\n')
                    error(
                        "Deletion/retraction failed for dataset/version %s with message: %s"
                        % (datasetName, string.join(fields[-2:], '\n')))
                    continue
Example #33
0
def extractFromFile(dataset, openfile, fileobj, session, cfHandler, aggdimName=None, varlocate=None, **context):
    """
    Extract metadata from a file, add to a database.

    dataset
      The dataset instance.

    openfile
      An open netCDF file object.

    fileobj
      A (logical) file instance.

    session
      A database session instance.

    cfHandler
      A CF handler instance

    aggdimName
      The name of the dimension which is split across files, if any.

    varlocate
      List with elements [varname, pattern]. The variable will be extracted from the file only if the filename
      matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']]

    context
      A dictionary with keys project, model, experiment, and run.

    """

    fileVersion = fileobj.versions[-1]

    # Get the aggregate dimension range
    if aggdimName is not None and openfile.hasVariable(aggdimName):
        aggvarFirst = openfile.getVariable(aggdimName, index=0)
        aggvarLast = openfile.getVariable(aggdimName, index=-1)
        aggvarLen = openfile.inquireVariableShape(aggdimName)[0]
        aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName))
        if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"):
            if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12:
                dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE)

    if aggdimName is not None and not openfile.hasVariable(aggdimName):
        info("Aggregate dimension not found: %s"%aggdimName)

    varlocatedict = {}
    if varlocate is not None:
        for varname, pattern in varlocate:
            varlocatedict[varname] = pattern

    # For each variable in the file:
    for varname in openfile.inquireVariableList():
        varshape = openfile.inquireVariableShape(varname)
        debug("%s%s"%(varname, `varshape`))

        # Check varlocate
        if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname], os.path.basename(fileVersion.location)):
            debug("Skipping variable %s in %s"%(varname, fileVersion.location))
            continue

        # Create a file variable
        filevar = FileVariable(varname, openfile.getAttribute('long_name', varname, None))
        fileobj.file_variables.append(filevar)

        # Create attributes:
        for attname in openfile.inquireAttributeList(varname):
            attvalue = openfile.getAttribute(attname, varname)
            atttype, attlen = getTypeAndLen(attvalue)
            attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen)
            filevar.attributes.append(attribute)
            debug('  %s.%s = %s'%(varname, attname, `attvalue`))

        # Create dimensions
        seq = 0
        dimensionList = openfile.inquireVariableDimensions(varname)
        for dimname, dimlen in zip(dimensionList, varshape):
            dimension = FileVariableDimension(dimname, dimlen, seq)
            filevar.dimensions.append(dimension)
            if dimname==aggdimName:
                filevar.aggdim_first = float(aggvarFirst)
                filevar.aggdim_last = float(aggvarLast)
                filevar.aggdim_units = aggvarunits
            seq += 1

        # Set coordinate axis range and type if applicable
        if len(varshape)==1:
            var0 = openfile.getVariable(varname, index=0)
            varn = openfile.getVariable(varname, index=-1)
            if cfHandler.axisIsLatitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Y'
            elif cfHandler.axisIsLongitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'X'
            elif cfHandler.axisIsLevel(filevar):
                vararray = openfile.getVariable(varname)
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Z'
                filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above

    # Create global attribute
    for attname in openfile.inquireAttributeList():
        attvalue = openfile.getAttribute(attname, None)
        atttype, attlen = getTypeAndLen(attvalue)
        attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen)
        fileobj.attributes.append(attribute)
        if attname=='tracking_id':
            fileVersion.tracking_id = attvalue
        debug('.%s = %s'%(attname, attvalue))
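
The varlocate argument pairs a variable name with a filename prefix pattern, as described in the docstring above; a short illustration with made-up entries:

# 'ps' is only extracted from files whose basename starts with 'ps_',
# and 'orog' only from files whose basename starts with 'orog_'.
varlocate = [['ps', 'ps_'], ['orog', 'orog_']]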
Example #34
0
def extractFromFile(dataset, openfile, fileobj, session, handler, cfHandler, aggdimName=None, varlocate=None, exclude_variables=None, perVariable=None, **context):
    """
    Extract metadata from a file, add to a database.

    dataset
      The dataset instance.

    openfile
      An open netCDF file object.

    fileobj
      A (logical) file instance.

    session
      A database session instance.

    cfHandler
      A CF handler instance

    handler
      Project handler

    aggdimName
      The name of the dimension which is split across files, if any.

    varlocate
      List with elements [varname, pattern]. The variable will be extracted from the file only if the filename
      matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']]

    exclude_variables
      List of variables excluded from publication (thredds_exclude_variables).

    perVariable
      Boolean. If true, try to identify a single target variable and extract only it; if false, extract all variables.

    context
      A dictionary with keys project, model, experiment, and run.

    """

    fileVersion = fileobj.versions[-1]

    # Get the aggregate dimension range
    if aggdimName is not None and openfile.hasVariable(aggdimName):
        aggvarFirst = openfile.getVariable(aggdimName, index=0)
        aggvarLast = openfile.getVariable(aggdimName, index=-1)
        aggvarLen = openfile.inquireVariableShape(aggdimName)[0]
        aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName))
        if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"):
            if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12:
                dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE)

    if aggdimName is not None and not openfile.hasVariable(aggdimName):
        info("Aggregate dimension not found: %s"%aggdimName)

    varlocatedict = {}
    if varlocate is not None:
        for varname, pattern in varlocate:
            varlocatedict[varname.strip()] = pattern.strip()

    # Create global attribute
    target_variable = None
    for attname in openfile.inquireAttributeList():
        attvalue = openfile.getAttribute(attname, None)
        atttype, attlen = getTypeAndLen(attvalue)
        attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen)
        fileobj.attributes.append(attribute)
        if attname == 'tracking_id':
            fileVersion.tracking_id = attvalue
        # extract target_variable from global attributes
        if attname == 'variable_id' and perVariable:
            target_variable = attvalue
            debug('Extracted target variable from global attributes: %s' % target_variable)
        debug('.%s = %s' % (attname, attvalue))

    # try to get target_variable from DRS if not found in global attributes
    if not target_variable and perVariable:
        config = getConfig()
        if config is not None:
            drs_pattern = handler.getFilters()[0][1:-1]
            drs_file_pattern = '%s/(?P<filename>[\w.-]+)$' % drs_pattern
            drs_parts = re.search(drs_file_pattern, openfile.path).groupdict()
            if 'variable' in drs_parts:
                target_variable = drs_parts['variable']
                debug('Extracted target variable from DRS: %s' % target_variable)

    # target_variable must be present in the file
    if target_variable not in openfile.inquireVariableList():
        target_variable = None

    # For each variable in the file:
    for varname in openfile.inquireVariableList():

        # we need to extract only target, aggregation and coverage variables
        if target_variable:
            is_coverage_variable = check_coverage_variable(varname, openfile)
            if not is_coverage_variable and varname != target_variable and varname != aggdimName:
                debug("Skipping variable %s in %s (not target (%s), coverage or aggregation (%s) variable)" % (varname, fileVersion.location, target_variable, aggdimName))
                continue

        varshape = openfile.inquireVariableShape(varname)
        debug("%s%s"%(varname, `varshape`))

        # Check varlocate
        if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname].strip(), os.path.basename(fileVersion.location)):
            debug("Skipping variable %s in %s"%(varname, fileVersion.location))
            continue

        is_target_variable = True
        if target_variable and target_variable != varname:
            is_target_variable = False
        elif varname in exclude_variables:
            is_target_variable = False

        # Create a file variable
        varstr = openfile.getAttribute('long_name', varname, None)
        
        if not varstr is None and len(varstr) > 255:
            varstr = varstr[0:255]
        filevar = FileVariable(varname, varstr, is_target_variable=is_target_variable)
        fileobj.file_variables.append(filevar)

        # Create attributes:
        for attname in openfile.inquireAttributeList(varname):
            attvalue = openfile.getAttribute(attname, varname)
            atttype, attlen = getTypeAndLen(attvalue)
            attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen)
            filevar.attributes.append(attribute)
            debug('  %s.%s = %s'%(varname, attname, `attvalue`))

        # Create dimensions
        seq = 0
        dimensionList = openfile.inquireVariableDimensions(varname)
        for dimname, dimlen in zip(dimensionList, varshape):
            dimension = FileVariableDimension(dimname, dimlen, seq)
            filevar.dimensions.append(dimension)
            if dimname==aggdimName:
                filevar.aggdim_first = float(aggvarFirst)
                filevar.aggdim_last = float(aggvarLast)
                filevar.aggdim_units = aggvarunits
            seq += 1

        # Set coordinate axis range and type if applicable
        if len(varshape)==1:
            var0 = openfile.getVariable(varname, index=0)
            if var0 is None:
                continue
            varn = openfile.getVariable(varname, index=-1)
            if cfHandler.axisIsLatitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Y'
            elif cfHandler.axisIsLongitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'X'
            elif cfHandler.axisIsLevel(filevar):
                vararray = openfile.getVariable(varname)
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Z'
                filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above
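
The DRS fallback above builds a regular expression from the project handler's directory filter and relies on it containing a named 'variable' group. An illustrative, self-contained sketch with an invented pattern and path:

import re

# Hypothetical DRS pattern with a named 'variable' group; the filename part is
# appended exactly as in the code above.
drs_pattern = r'/data/(?P<variable>\w+)/(?P<version>v\d+)'
drs_file_pattern = r'%s/(?P<filename>[\w.-]+)$' % drs_pattern
match = re.search(drs_file_pattern, '/data/tas/v20200101/tas_day_model_r1i1p1f1_gn.nc')
print(match.groupdict()['variable'])   # -> 'tas'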
Example #35
0
def updateDatasetVersion(dset, dsetVersion, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, replace=False, forceRescan=False, **context):

    if replace:
        info("Replacing files in dataset: %s, version %d"%(dset.name, dsetVersion.version))
    else:
        info("Updating files in dataset: %s, version %d"%(dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']

    # Get the base dictionary for the entire dataset
    basedict = dset.getBaseDictionary()

    # For each item in the pathlist:
    seq = 0
    fileModified = False                # Any file has been modified (added, replaced, or deleted)
    newFileVersionObjs = []
    nfiles = len(pathlist)
    for path, sizet in pathlist:

        # Rescan this file if it has been added, or replaced
        rescanFile = haveLatestDsetVersion

        size, mtime=sizet
        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            csum = extraFieldsGet(extraFields, (dset.name, path, 'checksum'), dsetVersion)
            csumtype = extraFieldsGet(extraFields, (dset.name, path, 'checksum_type'), dsetVersion)
            techNotes = extraFields.get((dset.name, -1, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, -1, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, -1, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, -1, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Check if 'from_file' was specified for this file
        fromfile = None
        if extraFields is not None:
            fromfile = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if fromfile is None:
            oldpath = path
        else:
            frombase = os.path.basename(fromfile)
            tobase = os.path.basename(path)
            if frombase!=tobase:
                info("Basenames are different for files: %s and %s. Ignoring 'from_file' option."%(path, fromfile))
                oldpath = path
            else:
                oldpath = fromfile

        # If the item is in the current dataset version, get the file version obj and add to the list
        if locdict.has_key(oldpath):
            del todelete[oldpath]
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            
            # If the file matches the existing file version, no-op, ...
            if os.path.exists(oldpath) and compareFiles(fileVersionObj, handler, path, size, offline, checksum=csum):
                if not forceRescan:
                    info("File %s exists, skipping"%path)
                newFileVersionObjs.append(fileVersionObj)
                rescanFile = False

            # ... else create a new version of the file
            else:
                if oldpath!=path:
                    info("Replacing file %s"%oldpath)
                newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
                newFileVersionObjs.append(newFileVersionObj)
                fileObj.deleteChildren(session)
                fileModified = True

        # Else create a new file / file version object and add to the list ...
        else:
            fileObj = FileFactory(dset, path, basedict, session)
            newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
            newFileVersionObjs.append(newFileVersionObj)
            fileModified = True

        # ... and rescan if necessary
        if rescanFile or forceRescan:
            if not offline:
                info("Scanning %s"%path)
                f = handler.openPath(path)
                extractFromFile(dset, f, fileObj, session, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, **context)
                f.close()
            else:
                info("File %s is offline"%path)

        # Callback progress
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # If updating, add the file version objects ...
    if not replace:
        for fileVersionObj in todelete.values():
            newFileVersionObjs.append(fileVersionObj)

    # ... else if rescanning delete the file object children
    elif haveLatestDsetVersion:
        for fileVersionObj in todelete.values():
            fileObj = fileVersionObj.parent
            fileObj.deleteChildren(session)
            fileModified = True

    # Create a new dataset version if:
    # - a file has been added, replaced, or deleted, and
    # - the current version is the latest
    createNewDatasetVersion = haveLatestDsetVersion and fileModified
    
    return createNewDatasetVersion, newFileVersionObjs
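
extraFieldsGet() is used above for the checksum lookups but is not shown in this excerpt. A minimal sketch, assuming it tries the specific dataset version first and then falls back to the catch-all -1 key used by the other extraFields.get() calls in this function; that fallback order is an assumption:

def extra_fields_get_sketch(extraFields, key, dsetVersion):
    # key is a (dataset_name, path, field) tuple, mirroring the calls above.
    dsetname, path, field = key
    value = extraFields.get((dsetname, dsetVersion.version, path, field), None)
    if value is None:
        value = extraFields.get((dsetname, -1, path, field), None)
    return value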
Example #36
0
def createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, masterGateway=None, **context):

    fobjlist = []                       # File objects in the dataset
    nfiles = len(pathlist)

    basedict = {}                       # file.base => 1
    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    seq = 0
    for path, sizet in pathlist:
        size, mtime = sizet

        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            csum = extraFields.get((dset.name, -1, path, 'checksum'), None)
            csumtype = extraFields.get((dset.name, -1, path, 'checksum_type'), None)
            techNotes = extraFields.get((dset.name, -1, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, -1, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, -1, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, -1, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Create a file and version
        base = generateFileBase(path, basedict, dset.name)
        file = File(base, 'netCDF')
        basedict[base] = 1
        fileVersion = FileVersion(1, path, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
        file.versions.append(fileVersion)
        fobjlist.append(fileVersion)
        seq += 1

        dset.files.append(file)

        # Extract the dataset contents
        if not offline:
            info("Scanning %s"%path)
            f = handler.openPath(path)
            extractFromFile(dset, f, file, session, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, **context)
            f.close()
        else:
            info("File %s is offline"%path)

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return True, fobjlist
Example #37
0
def createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, masterGateway=None, useVersion=-1, **context):

    fobjlist = []                       # File objects in the dataset
    nfiles = len(pathlist)

    basedict = {}                       # file.base => 1
    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    exclude_variables = configOptions['exclude_variables']
    perVariable = configOptions['perVariable']

    seq = 0
    for path, sizet in pathlist:
        size, mtime = sizet

        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            csum = extraFields.get((dset.name, useVersion, path, 'checksum'), None)
            csumtype = extraFields.get((dset.name, useVersion, path, 'checksum_type'), None)
            techNotes = extraFields.get((dset.name, useVersion, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, useVersion, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Create a file and version
        base = generateFileBase(path, basedict, dset.name)
        file = File(base, 'netCDF')
        basedict[base] = 1
        fileVersion = FileVersion(1, path, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
        file.versions.append(fileVersion)
        fobjlist.append(fileVersion)
        seq += 1

        dset.files.append(file)

        # Extract the dataset contents
        if not offline:
            info("Scanning %s"%path)
            f = handler.openPath(path)
            extractFromFile(dset, f, file, session, handler, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, exclude_variables=exclude_variables, perVariable=perVariable, **context)
            f.close()
        else:
            info("File %s is offline"%path)

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return True, fobjlist
Example #38
0
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct): 
        datasetName,versionno = datasetNames[iloop]

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s"%datasetName)
            
        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate=False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName,versionno)])==0:
                warning("No files specified for dataset %s, version %d."%(datasetName,versionno))
                continue
            firstFile = dmap[(datasetName,versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter  = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name
            info("Using project name = %s"%projectName)
        if prevProject is not None and projectName!=prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored'%name)
            else:
                context[name] = value

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset=None
        if testProgress1 is not None:
           testProgress1[1] = (100./ct)*iloop
           if not offline:
               testProgress1[2] = (100./ct)*iloop + (50./ct)
           else:
               testProgress1[2] = (100./ct)*iloop + (100./ct)
        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
           testProgress2[1] = (100./ct)*iloop + 50./ct
           testProgress2[2] = (100./ct)*(iloop + 1)
        if runAggregate:
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)
            
        # Save the context with the dataset, so that it can be searched later
        handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
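# Hypothetical driver sketch (not from the source): scan and aggregate the datasets listed in a
# mapfile with the routine above. It assumes an initialized publisher node, that ``Session`` is an
# SQLAlchemy sessionmaker bound to the node database, and that these names are importable from
# esgcet.publish; the mapfile path, project name and file filter are made up.
from esgcet.publish import iterateOverDatasets, readDatasetMap, CREATE_OP

dmap = readDatasetMap('/esg/maps/example.map')   # {(dataset_name, version): [(path, size), ...]}
datasetNames = sorted(dmap.keys())               # [(dataset_name, version), ...]
datasets = iterateOverDatasets(
    'cmip5',             # projectName (hypothetical)
    dmap,                # dataset map
    None,                # directoryMap is unused when a dmap is given
    datasetNames,
    Session,
    'time',              # aggregateDimension
    CREATE_OP,
    r'.*\.nc$',          # filefilt (ignored when a dmap is given)
    {},                  # initcontext
    False,               # offlineArg
    {})                  # properties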
Example #39
0
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None, offline=False, operation=CREATE_OP,
                       progressCallback=None, stopEvent=None, perVariable=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None,
                       comment=None, useVersion=-1, forceRescan=False, nodbwrite=False, pid_connector=None, test_publication=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    fileIterator
      An iterator that yields (file_path, (size, mtime)) tuples, where ``size`` is an integer file size in bytes and ``mtime`` is the file modification time.

    dbSession
      A database Session.

    handler
      Project handler

    cfHandler  
      A CF handler instance

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    offline
      Boolean, True if the files are offline, cannot be scanned.

    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.

    useVersion=-1
      Integer version number of the dataset version to modify. By default the latest version is modified.

    forceRescan
      Boolean, if True force all files to be rescanned on an update.

    pid_connector
        ESGF_PID_connector object to register PIDs

    test_publication
        Flag whether publication is for production or test

    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields needed to uniquely define the dataset.

    """

    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s'%context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None

        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None

        versionByDate = config.getboolean(section, 'version_by_date', default=False)

        if not offline:
            if perVariable is None:
                perVariable = config.getboolean(section, 'variable_per_file', False)
        else:
            perVariable = False
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False

    # If there is no configuration there is no exclusion list to read.
    if config is not None:
        exclude_variables = splitLine(config.get(section, 'thredds_exclude_variables', default=''), sep=',')
    else:
        exclude_variables = []

    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType
    configOptions['exclude_variables'] = exclude_variables
    configOptions['perVariable'] = perVariable

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation==CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s"%datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset."%dset.name)

    createTime = datetime.datetime.now() # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if (nodbwrite): 
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context)
        info("dataset scan complete, not writing to database")
        return dset
       
    elif operation==CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s"%datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, useVersion=useVersion, **context)
        
    elif operation in [UPDATE_OP, REPLACE_OP]:
        if operation==REPLACE_OP:
            versionObj = dset.getVersionObj(-1)
        else:
            versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, useVersion=useVersion, **context)
         
    elif operation==RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
         
    elif operation==DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s"%`operation`)

    # Create a new dataset version if necessary
    if useVersion == -1:
        if keepVersion:
            if existingVersion<=0:
                newVersion = getInitialDatasetVersion(versionByDate)
            else:
                newVersion = existingVersion
        elif newVersion is None:
            newVersion = getNextDatasetVersion(existingVersion, versionByDate)
    else:
        newVersion = useVersion

    dset.reaggregate = False

    if newVersion<existingVersion:
        versionList = dset.getVersionList()
        if newVersion in versionList:
            addNewVersion = False

    # Add a new version
    if addNewVersion and newVersion > existingVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title

        # if project uses PIDs, generate PID for dataset
        dataset_pid = None
        if pid_connector:
            dataset_pid = pid_connector.make_handle_from_drsid_and_versionnumber(drs_id=datasetName, version_number=newVersion)
            info("Assigned PID to dataset %s.v%s: %s " % (datasetName, newVersion, dataset_pid))

        # if project uses citation, build citation url
        project_config_section = 'config:%s' %context.get('project')
        citation_url = handler.get_citation_url(project_config_section, config, datasetName, newVersion, test_publication)

        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes,
                                                  tech_notes_title=datasetTechNotesTitle, pid=dataset_pid, citation_url=citation_url)

        info("New dataset version = %d"%newDsetVersionObj.version)
        
        try:
            for var in dset.variables:
                session.delete(var)
        except IntegrityError as ie:
            debug("sqlalchemy IntegrityError: " + str(ie))
            raise ESGPublishError("Error in creating dataset version, did you already publish this version to the database?")
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    # Keep the current (latest) version
    elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d"%versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    elif masterGateway is not None:     # Force version set on replication
        info("Dataset version = %d"%newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
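# The new-version bookkeeping above follows a useVersion / keepVersion / newVersion precedence.
# A self-contained restatement for illustration only (date-based versioning is reduced to a plain
# "start at 1, increment by 1" rule here; the real code delegates to getInitialDatasetVersion and
# getNextDatasetVersion):
def resolve_new_version(useVersion, keepVersion, newVersion, existingVersion):
    if useVersion != -1:
        return useVersion                  # an explicit version always wins
    if keepVersion:
        # keep the current version, or fall back to an initial version
        return existingVersion if existingVersion > 0 else 1
    if newVersion is None:
        return existingVersion + 1         # default: increment by one
    return newVersion                      # explicitly requested version

assert resolve_new_version(-1, False, None, 3) == 4
assert resolve_new_version(-1, True, None, 3) == 3
assert resolve_new_version(5, False, None, 3) == 5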
Example #40
0
def updateDatasetVersion(dset, dsetVersion, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, replace=False, forceRescan=False, useVersion=-1, **context):

    if replace:
        info("Replacing files in dataset: %s, version %d"%(dset.name, dsetVersion.version))
    else:
        info("Updating files in dataset: %s, version %d"%(dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    exclude_variables = configOptions['exclude_variables']
    perVariable = configOptions['perVariable']

    # Get the base dictionary for the entire dataset
    basedict = dset.getBaseDictionary()

    # For each item in the pathlist:
    seq = 0
    fileModified = False                # Any file has been modified (added, replaced, or deleted)
    newFileVersionObjs = []
    nfiles = len(pathlist)
    for path, sizet in pathlist:

        # Rescan this file if it has been added, or replaced
        rescanFile = haveLatestDsetVersion

        size, mtime=sizet
        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            if useVersion != -1:
                csum = extraFields.get((dset.name, useVersion, path, 'checksum'), None)
                csumtype = extraFields.get((dset.name, useVersion, path, 'checksum_type'), None)
            else:
                csum = extraFieldsGet(extraFields, (dset.name, path, 'checksum'), dsetVersion)
                csumtype = extraFieldsGet(extraFields, (dset.name, path, 'checksum_type'), dsetVersion)
            techNotes = extraFields.get((dset.name, useVersion, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, useVersion, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Check if 'from_file' was specified for this file
        fromfile = None
        if extraFields is not None:
            fromfile = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if fromfile is None:
            oldpath = path
        else:
            frombase = os.path.basename(fromfile)
            tobase = os.path.basename(path)
            if frombase!=tobase:
                info("Basenames are different for files: %s and %s. Ignoring 'from_file' option."%(path, fromfile))
                oldpath = path
            else:
                oldpath = fromfile

        # If the item is in the current dataset version, get the file version obj and add to the list
        if locdict.has_key(oldpath):
            del todelete[oldpath]
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            
            # If the file matches the existing file version, no-op, ...
            if os.path.exists(oldpath) and compareFiles(fileVersionObj, handler, path, size, offline, checksum=csum):
                if not forceRescan:
                    info("File %s exists, skipping"%path)
                newFileVersionObjs.append(fileVersionObj)
                rescanFile = False

            # ... else create a new version of the file
            else:
                if oldpath!=path:
                    info("Replacing file %s"%oldpath)
                newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
                newFileVersionObjs.append(newFileVersionObj)
                fileObj.deleteChildren(session)
                fileModified = True

        # Else create a new file / file version object and add to the list ...
        else:
            fileObj = FileFactory(dset, path, basedict, session)
            newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
            newFileVersionObjs.append(newFileVersionObj)
            fileModified = True

        # ... and rescan if necessary
        if rescanFile or forceRescan:
            if not offline:
                info("Scanning %s"%path)
                f = handler.openPath(path)
                extractFromFile(dset, f, fileObj, session, handler, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, exclude_variables=exclude_variables, perVariable=perVariable, **context)
                f.close()
            else:
                info("File %s is offline"%path)

        # Callback progress
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # If updating, add the file version objects ...
    if not replace:
        for fileVersionObj in todelete.values():
            newFileVersionObjs.append(fileVersionObj)

    # ... else if rescanning delete the file object children
    elif haveLatestDsetVersion:
        for fileVersionObj in todelete.values():
            fileObj = fileVersionObj.parent
            fileObj.deleteChildren(session)
            fileModified = True

    # Create a new dataset version if:
    # - a file has been added, replaced, or deleted, and
    # - the current version is the latest
    createNewDatasetVersion = haveLatestDsetVersion and fileModified

    return createNewDatasetVersion, newFileVersionObjs
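# Illustration of the ``extraFields`` lookup convention used above: keys are tuples of
# (dataset_name, version, file_path, field) -- or (dataset_name, file_path, field) via
# extraFieldsGet when no version is given -- and values come from optional mapfile columns.
# The dataset id, path and checksum below are made up.
extraFields = {
    ('cmip5.output1.MODEL.expt.mon.atmos.tas', 1,
     '/data/tas/tas_200001-200912.nc', 'checksum'): 'ab12cd34ef56...',
    ('cmip5.output1.MODEL.expt.mon.atmos.tas', 1,
     '/data/tas/tas_200001-200912.nc', 'checksum_type'): 'SHA256',
}
csum = extraFields.get(('cmip5.output1.MODEL.expt.mon.atmos.tas', 1,
                        '/data/tas/tas_200001-200912.nc', 'checksum'), None)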
Example #41
0
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True, las=False, progressCallback=None, service=None, perVariable=None, threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False, restInterface=False, schema=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status
    
    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is
      set to the string for all datasets being published. This function
      can be overridden in the project handler to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true (default False), reinitialize the LAS server.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, generate the catalogs as strings rather than writing them to the THREDDS content
      directory, and set threddsCatalogDictionary[(dataset_name, version)] = string_catalog.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to value of thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary. 
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    """

    session = Session()
    resultDict = {}
    if readFromCatalog and threddsCatalogDictionary is None:
            raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s"%datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno==-1:
                versionno=latestVersion
            if versionno!=latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno,latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName,versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s"%threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno)
                threddsCatalogDictionary[(datasetName,versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:    
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)
Example #42
0
    # and reinitialize the THREDDS server.
    if thredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')
        for datasetName,version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if dset is None:
                continue
            for versionObj in versionObjs:
                # send unpublication info to handle server
                if pid_connector:
                    pid_connector.unpublish_one_version(drs_id=datasetName, version_number=versionObj.version)
                catalog = session.query(Catalog).filter_by(dataset_name=dset.name, version=versionObj.version).first()
                if catalog is not None:
                    path = os.path.join(threddsRoot, catalog.location)
                    if os.path.exists(path):
                        info("Deleting THREDDS catalog: %s"%path)
                        os.unlink(path)
                        event = Event(dset.name, versionObj.version, DELETE_THREDDS_CATALOG_EVENT)
                        dset.events.append(event)
                    session.delete(catalog)

        session.commit()
        if reinitThredds:
            updateThreddsMasterCatalog(Session)
            result = reinitializeThredds()

    # Delete the database entry (optional).
    if republish:
        republishList = []
    if deleteInDatabase:
        for datasetName,version in datasetNames:
Example #43
0
def iterateOverDatasets(projectName,
                        dmap,
                        directoryMap,
                        datasetNames,
                        Session,
                        aggregateDimension,
                        operation,
                        filefilt,
                        initcontext,
                        offlineArg,
                        properties,
                        testProgress1=None,
                        testProgress2=None,
                        handlerDictionary=None,
                        perVariable=None,
                        keepVersion=False,
                        newVersion=None,
                        extraFields=None,
                        masterGateway=None,
                        comment=None,
                        forceAggregate=False,
                        readFiles=False,
                        nodbwrite=False,
                        pid_connector=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    pid_connector
        esgfpid.Connector object to register PIDs

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct):
        datasetName, versionno = datasetNames[iloop]

        # Must specify version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError(
                    "Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list."
                )

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s" %
                                      datasetName)

        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate = False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName, versionno)]) == 0:
                warning("No files specified for dataset %s, version %d." %
                        (datasetName, versionno))
                continue
            firstFile = dmap[(datasetName, versionno)][0][0]
            fileiter = datasetMapIterator(dmap,
                                          datasetName,
                                          versionno,
                                          extraFields=extraFields,
                                          offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator(
                    [direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator(
                    [sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(
                datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName,
                                       firstFile,
                                       Session,
                                       validate=True,
                                       offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    firstFile)
            projectName = handler.name
            info("Using project name = %s" % projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError(
                "Multiple projects found: %s, %s. Can only publish from one project"
                % (prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored' % name)
            else:
                context[name] = value

        # add dataset_version to context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset = None
        if testProgress1 is not None:
            testProgress1[1] = (100. / ct) * iloop
            if not offline:
                testProgress1[2] = (100. / ct) * iloop + (50. / ct)
            else:
                testProgress1[2] = (100. / ct) * iloop + (100. / ct)

        dataset = extractFromDataset(datasetName,
                                     fileiter,
                                     Session,
                                     handler,
                                     cfHandler,
                                     aggregateDimensionName=aggregateDimension,
                                     offline=offline,
                                     operation=operation,
                                     progressCallback=testProgress1,
                                     perVariable=perVariable,
                                     keepVersion=keepVersion,
                                     newVersion=newVersion,
                                     extraFields=extraFields,
                                     masterGateway=masterGateway,
                                     comment=comment,
                                     useVersion=versionno,
                                     forceRescan=forceAggregate,
                                     nodbwrite=nodbwrite,
                                     pid_connector=pid_connector,
                                     **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.

        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
            testProgress2[1] = (100. / ct) * iloop + 50. / ct
            testProgress2[2] = (100. / ct) * (iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName,
                               Session,
                               aggregateDimensionName=aggregateDimension,
                               cfHandler=cfHandler,
                               progressCallback=testProgress2,
                               datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        if (not nodbwrite):
            handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
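# Hypothetical example of the (callback, initial, final) progress arguments accepted above. Note
# that the loop rewrites elements [1] and [2] in place (testProgress1[1] = ...), so a mutable list
# is used here even though the docstring says "tuple".
def reportProgress(progress):
    print "progress: %.1f%%" % progress

testProgress1 = [reportProgress, 0.0, 50.0]      # scan phase
testProgress2 = [reportProgress, 50.0, 100.0]    # aggregation phase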
Example #44
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las  
      Boolean flag: if true (default False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    data_node
        String, the datanode to unpublish (only for unpublication from Solr)

    """
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName
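# Illustration of the dataset identifier handed to the index service above: when a version is
# given it is appended as ".v<version>", otherwise the bare dataset name is unpublished.
# The values are hypothetical.
datasetName, version = 'cmip5.output1.MODEL.expt.mon.atmos.tas', 20120101
datasetToUnpublish = '%s.v%s' % (datasetName, version) if version > -1 else datasetName
# -> 'cmip5.output1.MODEL.expt.mon.atmos.tas.v20120101'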
Example #45
0
    # and reinitialize the THREDDS server.
    if thredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')
        for datasetName,version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if dset is None:
                continue
            for versionObj in versionObjs:
                # send unpublication info to handle server
                if pid_connector:
                    pid_connector.unpublish_one_version(drs_id=datasetName, version_number=versionObj.version)
                catalog = session.query(Catalog).filter_by(dataset_name=dset.name, version=versionObj.version).first()
                if catalog is not None:
                    path = os.path.join(threddsRoot, catalog.location)
                    if os.path.exists(path):
                        info("Deleting THREDDS catalog: %s"%path)
                        os.unlink(path)
                        event = Event(dset.name, versionObj.version, DELETE_THREDDS_CATALOG_EVENT)
                        dset.events.append(event)
                    session.delete(catalog)

        session.commit()
        if reinitThredds:
            updateThreddsMasterCatalog(Session)
            result = reinitializeThredds()

    # Delete the database entry (optional).
    if republish:
        republishList = []
    if deleteInDatabase:
        for datasetName,version in datasetNames:
Example #46
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las  
      Boolean flag: if true (default False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    data_node
        String, the datanode to unpublish (only for unpublication from Solr)

    """
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            service_certs_location = getServiceCertsLoc()
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, service_certs_location, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                if service.service_type == 'REST':
                    error('Cannot unpublish multiple versions using REST. Please specify a single dataset version ("dataset_id#1"). Skipping %s' % datasetName)
                    continue
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException as e:
                fields = repr(e).split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s" % (datasetToUnpublish, '\n'.join(fields[0:2])))
                continue
            except ESGPublishError as e:
                fields = repr(e).split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s" % (datasetToUnpublish, '\n'.join(fields[-2:])))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName
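A minimal sketch (not part of esgcet) of the id construction and operation gating used in the loop above; DELETE, UNPUBLISH and NO_OPERATION are stand-ins for the module-level constants, and the sample dataset id is hypothetical:

# Hedged sketch: reproduces the dataset-id construction and gateway-operation
# gating from the loop above, with stand-in constants and a hypothetical id.
DELETE, UNPUBLISH, NO_OPERATION = 1, 2, 3

def unpublish_id(dataset_name, version):
    """Return the identifier sent to the publication service."""
    if version > -1:
        # A single version is addressed as '<dataset_id>.v<version>'
        return '%s.v%s' % (dataset_name, version)
    # No version given: only the Hessian interface accepts this form
    return dataset_name

def gateway_flags(gateway_operation):
    """Mirror the deleteOnGateway/operation flags computed above."""
    delete_on_gateway = (gateway_operation == DELETE)
    operation = (gateway_operation != NO_OPERATION)
    return delete_on_gateway, operation

print(unpublish_id('cmip6.test.dataset', 20190101))  # cmip6.test.dataset.v20190101
print(gateway_flags(UNPUBLISH))                      # (False, True)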
Example #47
0
    def validateFile(self, fileobj):
        """
        for CMIP6, this will first verify if the data is written by CMOR at the correct version set in the ini file.
        If so, the file is declared valid. If not, file will go through PrePARE (CV) check.  PrePARE runs CFChecker

        Raises ESGPublishError if settings are missing or file fails the checks.
        Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        # TODO: refactor - these could be loaded up front in the constructor
        config = getConfig()
        project_section = 'project:' + self.name
        project_config_section = 'config:' + self.name
        min_cmor_version = config.get(project_section,
                                      "min_cmor_version",
                                      default="0.0.0")
        min_ds_version = config.get(project_section,
                                    "min_data_specs_version",
                                    default="0.0.0")
        data_specs_version = config.get(project_config_section,
                                        "data_specs_version",
                                        default="master")
        cmor_table_path = config.get(project_config_section,
                                     "cmor_table_path",
                                     default=DEFAULT_CMOR_TABLE_PATH)
        force_validation = config.getboolean(project_config_section,
                                             "force_validation",
                                             default=False)
        skip_validation = config.getboolean(project_config_section,
                                            "skip_validation",
                                            default=False)
        cmor_table_subdirs = config.getboolean(project_config_section,
                                               "cmor_table_subdirs",
                                               default=False)

        if skip_validation:

            if force_validation:
                raise ESGPublishError(
                    "skip_validation and force_validation both enabled in config"
                )

            info("skipping PrePARE because skip_validation set in config")
            return

        if not force_validation:

            if self.replica:
                info("skipping PrePARE for replica (file %s)" % f)
                return

            try:
                file_cmor_version = fileobj.getAttribute('cmor_version', None)
            except:
                file_cmor_version = None
                debug(
                    'File %s missing cmor_version attribute; will proceed with PrePARE check'
                    % f)

            passed_cmor = False
            if compareLibVersions(min_cmor_version, file_cmor_version):
                debug('File %s cmor-ized at version %s, passed!' %
                      (f, file_cmor_version))
                passed_cmor = True

        try:
            table = fileobj.getAttribute('table_id', None)
        except:
            raise ESGPublishError(
                "File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)
        except:
            raise ESGPublishError(
                "File %s missing required variable_id global attribute" % f)

        # data_specs_version drives CMOR table fetching
        # Behavior A (default): fetch the "master" branch (if "data_specs_version" is not set in esg.ini)
        # Behavior B: fetch the branch named by "data_specs_version=my_branch" in esg.ini
        # Behavior C: fetch the branch named in the file's global attributes, selected by "data_specs_version=file" in esg.ini

        try:
            file_data_specs_version = fileobj.getAttribute(
                'data_specs_version', None)
        except Exception as e:
            raise ESGPublishError(
                "File %s missing required data_specs_version global attribute"
                % f)

        if not compareLibVersions(min_ds_version, file_data_specs_version):
            raise ESGPublishError(
                "File %s data_specs_version is %s, which is less than the required minimum version of %s"
                % (f, file_data_specs_version, min_ds_version))
        # At this point the file has an acceptable data_specs_version.
        # If it was also CMOR-ized with a sufficient CMOR version, we can exit here.

        if (not force_validation) and passed_cmor:
            return

        if data_specs_version == "file":
            data_specs_version = file_data_specs_version

        table_dir = getTableDir(cmor_table_path, data_specs_version,
                                cmor_table_subdirs)
        debug("Validating {} using tables dir: {}".format(f, table_dir))

        try:
            process = validator.checkCMIP6(table_dir)
            if process is None:
                raise ESGPublishError(
                    "File %s failed the CV check - object create failure" % f)
            process.ControlVocab(f)
        except:
            raise ESGPublishError("File %s failed the CV check" % f)
Example #48
0
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        results = []
        lenresults = len(datasetNames)
        n = spi * lenresults
        j = 0
        for datasetName,versionno in datasetNames:
            if parentId is None:
                parentIdent = handler.getParentId(datasetName)
            elif isinstance(parentId, dict):
                parentIdent = parentId[datasetName]
            else:
                parentIdent = parentId
            messaging.info("Publishing: %s"%datasetName)
            dset, statusId, state, evname, status = publishDataset(datasetName, parentIdent, service, threddsRootURL, session, schema=schema, version=versionno)
            messaging.info("  Result: %s"%status.getStateItem())
            results.append((dset, statusId, state))
            resultDict[(datasetName,versionno)] = evname

            # Poll each dataset again
            j += 1
            if state not in (PublicationState.PROCESSING, PublicationState.SUCCESSFUL):
                issueCallback(progressCallback, j*spi, n, 0, 1)
                continue

            for i in range(spi):
                if state==PublicationState.SUCCESSFUL:
                    evname = PUBLISH_DATASET_EVENT
                    event = Event(dset.name, dset.getVersion(), evname)
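The fragment above budgets spi polling steps per dataset (n = spi * len(datasetNames)) and advances the callback by j*spi when a dataset finishes early. A small sketch of the assumed progress arithmetic follows, with a hypothetical progress_fraction helper standing in for issueCallback's internal scaling:

# Hedged sketch: assumed progress accounting for the publishing/polling loop above.
def progress_fraction(current, total, initial=0.0, final=1.0):
    """Map a (current, total) count into [initial, final], as issueCallback is
    assumed to do before invoking the user-supplied callback."""
    return initial + (final - initial) * float(current) / total

spi, dataset_names = 4, ['ds1', 'ds2', 'ds3']
n = spi * len(dataset_names)
for j in range(1, len(dataset_names) + 1):
    # Finishing dataset j without further polling jumps progress to j*spi / n
    print("after dataset %d: %.0f%%" % (j, 100 * progress_fraction(j * spi, n)))
# after dataset 1: 33%, after dataset 2: 67%, after dataset 3: 100%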
Example #49
0
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        if dset is None:
            raise ESGPublishError("Dataset not found: %s"%datasetName)
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there is more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds]:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
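The docstring above describes progressCallback as a (callback, initial, final) tuple and stopEvent as any object with a boolean stop_extract attribute. A hedged usage sketch of that calling convention (session and handler setup elided; the names below are placeholders, not working values):

# Hedged sketch of the calling convention documented above; the dataset id,
# Session factory and CF handler are hypothetical placeholders.
class StopFlag(object):
    """Minimal stand-in for a stop event: another thread may set stop_extract."""
    def __init__(self):
        self.stop_extract = False

def report(progress):
    print("aggregation progress: %.1f%%" % (100.0 * progress))

stopper = StopFlag()
progress_callback = (report, 0.0, 1.0)   # (callback, initial, final)

# aggregateVariables('hypothetical.dataset.id',
#                    Session,                 # a configured SQLAlchemy sessionmaker
#                    aggregateDimensionName='time',
#                    cfHandler=cf_handler,    # CF metadata handler instance (not shown)
#                    progressCallback=progress_callback,
#                    stopEvent=stopper)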
Example #50
0
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        if dset is None:
            raise ESGPublishError("Dataset not found: %s"%datasetName)
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there is more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # list of all target variables of a dataset
    dset_target_vars = set()

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:
            if filevar.is_target_variable:
                dset_target_vars.add(filevar.short_name)

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
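The duplicate-range check near the end of both aggregation examples reduces to a vectorized comparison of adjacent (first, last) aggregate-dimension values. A stripped-down illustration with hypothetical, already-sorted file ranges:

# Hedged sketch: the ascending-order (mono <= 0) overlap test from above,
# applied to hypothetical per-file time ranges in common units.
import numpy

file_ranges = [
    ("pr_185001-185912.nc", 0.0, 119.5),
    ("pr_186001-186912.nc", 120.0, 239.5),
    ("pr_186001-186912_dup.nc", 120.0, 239.5),   # duplicated decade
]
first_values = numpy.array([r[1] for r in file_ranges])
last_values = numpy.array([r[2] for r in file_ranges])

# An overlap exists wherever a file's last value reaches into the next file's first value
overlap = last_values[:-1] >= first_values[1:]
print(overlap.nonzero()[0])   # [1]: the second and third files overlap (duplicate data)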