Example #1
0
def processIterator(command, commandArgs, filefilt=None, offline=False):
    """Create an iterator from an external process.

    Returns an iterator that returns (path, size) at each iteration.

    command
      Command string to execute the process - for example,
      "/some/python/bin/hsils.py".  The process must write to stdout,
      a blank-separated "path size" on each line.

    commandArgs
      String arguments to the process.

    filefilt
      A regular expression as defined in the Python re module. Each file returned has basename matching the filter.

    offline
      Boolean, if True don't try to stat files.

    """

    try:
        f = subprocess.Popen(command+" "+commandArgs, shell=True, stdout=subprocess.PIPE).stdout
    except:
        error("Error running command '%s %s', check configuration option 'offline_lister_executable'."%(command, commandArgs))
        raise
    # Bug fix: close the pipe even if the consumer abandons the generator
    # (GeneratorExit) or filelistIterator_1 raises; previously the descriptor
    # was only closed on normal exhaustion.
    try:
        for path, size in filelistIterator_1(f, filefilt, offline=offline):
            yield (path, size)
    finally:
        f.close()
    def catalog_thredds( self, parent ):
        """Prompt for an output file via a Save-As dialog and write a THREDDS
        catalog for the currently selected dataset to it.

        parent
          Parent widget (kept for interface compatibility).
        """
        from esgcet.publish import generateThredds
        from tkMessageBox import showerror

        dialog_icon = tkFileDialog.SaveAs(master=self.parent.control_frame2,
                         filetypes=[("THREDDS", "*.thredds", "THREDDS XML"), ("XML", "*.xml", "THREDDS XML")], title = 'File Open Selection')
        dirfilename=dialog_icon.show(initialdir=os.getcwd())
        if len(dirfilename)==0:
            # Dialog cancelled.
            return
        dir = dirfilename[:dirfilename.rfind('/')]
        filename = dirfilename[dirfilename.rfind('/')+1:]
        name = filename[:filename.rfind('.')].strip()

        # Check for directory and filename error
        if dirfilename in [(), '']:
           showerror("ESGPublishError", "There was an error in the selected directory and specified filename." )
           return
        if name in [(), '']:
           showerror("ESGPublishError", "There was an error in the specified filename." )
           return
        if os.access(dir, os.W_OK) != True:
           showerror("ESGPublishError", "You do not have write permission for the selected directory." )
           return

        # Generate a THREDDS configuration file
        threddsOutputPath = dirfilename
        threddsOutput = open(threddsOutputPath, "w")
        try:
            selected_page = self.parent.parent.main_frame.selected_top_page
            datasetName = self.parent.parent.datasetName
            generateThredds(datasetName, self.parent.parent.Session, threddsOutput, self.parent.parent.handlerDictionary[datasetName])
        except Exception as inst:  # was py2-only "except Exception, inst"
            error(traceback.format_exc())
            showerror("ESGPublishError", inst )
        finally:
            # Bug fix: the output file handle was previously never closed.
            threddsOutput.close()
Example #3
0
def processIterator(command, commandArgs, filefilt=None, offline=False):
    """Create an iterator from an external process.

    Returns an iterator that returns (path, size) at each iteration.

    command
      Command string to execute the process - for example,
      "/some/python/bin/hsils.py".  The process must write to stdout,
      a blank-separated "path size" on each line.

    commandArgs
      String arguments to the process.

    filefilt
      A regular expression as defined in the Python re module. Each file returned has basename matching the filter.

    offline
      Boolean, if True don't try to stat files.

    """

    try:
        f = subprocess.Popen(command + " " + commandArgs,
                             shell=True,
                             stdout=subprocess.PIPE).stdout
    except:
        error(
            "Error running command '%s %s', check configuration option 'offline_lister_executable'."
            % (command, commandArgs))
        raise
    # Bug fix: guarantee the pipe is closed even when iteration ends early
    # (consumer drops the generator, or filelistIterator_1 raises); the
    # original only closed it after normal exhaustion.
    try:
        for path, size in filelistIterator_1(f, filefilt, offline=offline):
            yield (path, size)
    finally:
        f.close()
Example #4
0
def checksum(path, client):
    """
    Calculate a file checksum.

    Returns the String checksum.

    path
      String pathname.

    client
      String client name. The command executed is '``client path``'. The client may be an absolute path ("/usr/bin/md5sum") or basename ("md5sum"). For a basename, the executable must be in the user's search path.

    Raises ESGPublishError if the file does not exist.
    """

    if not os.path.exists(path):
        raise ESGPublishError("No such file: %s"%path)

    command = "%s %s"%(client, path)
    info("Running: %s"%command)

    try:
        f = subprocess.Popen([client, path], stdout=subprocess.PIPE).stdout
    except Exception:
        # Bug fix: the original formatted two %s placeholders with a single
        # string ("'%s %s'."%command -> TypeError), and fell through with `f`
        # unbound (NameError). Log with a correct format and re-raise.
        error("Error running command '%s', check configuration option 'checksum'."%command)
        raise
    lines = f.readlines()
    f.close()  # bug fix: the pipe was never closed
    csum = lines[0].split()[0]

    return csum
Example #5
0
def checksum(path, client):
    """
    Calculate a file checksum.

    Returns the String checksum.

    path
      String pathname.

    client
      String client name. The command executed is '``client path``'. The client may be an absolute path ("/usr/bin/md5sum") or basename ("md5sum"). For a basename, the executable must be in the user's search path.

    Raises ESGPublishError if the file does not exist.
    """

    if not os.path.exists(path):
        raise ESGPublishError("No such file: %s" % path)

    command = "%s %s" % (client, path)
    info("Running: %s" % command)

    try:
        f = subprocess.Popen([client, path], stdout=subprocess.PIPE).stdout
    except Exception:
        # Bug fix: the message used two %s placeholders but was formatted with
        # a single string (TypeError), and execution fell through to use an
        # unbound `f` (NameError). Log correctly and re-raise instead.
        error(
            "Error running command '%s', check configuration option 'checksum'."
            % command)
        raise
    lines = f.readlines()
    f.close()  # bug fix: the pipe was never closed
    csum = lines[0].split()[0]

    return csum
def calc_checksum(filepath, checksum_client):
    """Compute a checksum for *filepath* with an external client.

    Returns (filepath, checksum_string), where the checksum is None when the
    file does not exist or the command fails.

    filepath
      String path of the file to checksum.

    checksum_client
      Checksum executable, e.g. "md5sum"; invoked as '``checksum_client filepath``'.
    """
    csum = None
    if os.path.exists(filepath):
        command = "%s %s"%(checksum_client, filepath)
        info("Running: %s"%command)
        try:
            # NOTE(review): os.popen ignores the exit status; a failing client
            # that prints nothing yields csum == '' — confirm callers tolerate this.
            f = os.popen(command).read()
            csum = f.split(' ')[0]
        except Exception:
            # Bug fix: was a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit.
            error("Error running command '%s', check configuration option 'checksum'."%command)

    return filepath, csum
Example #7
0
    def loadEntryPoints(self):
        """
        Get the entry points for the entry point group associated with this registry,
        and build an entry point dictionary.

        Returns optionDict: handler_name => (handler_value, distribution, mustload).
        """
        optionDict = {}
        # distPlugins: entry_point_distribution => {entry_point_name: entry_point}
        distPlugins = {}
        for ep in iter_entry_points(self.entryPointGroup):
            distPlugins.setdefault(ep.dist, {})[ep.name] = ep

        for dist, v in distPlugins.items():
            if HANDLER_NAME_ENTRY_POINT in v:  # has_key is py2-only
                if HANDLER_ENTRY_POINT in v:
                    handlerName = v[HANDLER_NAME_ENTRY_POINT].module_name
                    if handlerName in optionDict:
                        handlerValue = v[HANDLER_ENTRY_POINT]
                        prevValue, prevDist, mustload = optionDict[handlerName]
                        if handlerValue != prevValue:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerValue, prevDist,
                                   handlerName, prevValue))
                    else:
                        optionDict[handlerName] = (v[HANDLER_ENTRY_POINT],
                                                   dist, True)
                else:
                    # Bug fix: the original formatted an undefined name 'k'
                    # here (NameError); the distribution is 'dist'.
                    warning("Distribution %s does not define a %s option." %
                            (dist, HANDLER_ENTRY_POINT))
            elif HANDLER_DICT_ENTRY_POINT in v:
                handlerDict = v[HANDLER_DICT_ENTRY_POINT].load()
                for handlerName, handlerClassName in handlerDict.items():
                    if handlerName in optionDict:
                        # Bug fix: the original read v[HANDLER_ENTRY_POINT],
                        # which need not exist in this branch (KeyError), and
                        # overwrote the loop variable handlerClassName. Compare
                        # the class name from the handler dictionary instead.
                        prevValue, prevDist, mustload = optionDict[handlerName]
                        if handlerClassName != prevValue:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerClassName, prevDist,
                                   handlerName, prevValue))
                    else:
                        optionDict[handlerName] = (handlerClassName, dist,
                                                   False)
        return optionDict
Example #8
0
def pollDatasetPublicationStatus(datasetName, Session, service=None):
    """
    Get the current dataset publication status by polling the gateway.

    Returns the current dataset publication status.

    datasetName
      String dataset name.

    Session
      A database Session.

    service
      Web service proxy instance. If None, the service is created.

    """

    session = Session()
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is None:
        messaging.error("Dataset not found: %s" % datasetName)
        session.close()
        return PUBLISH_FAILED_EVENT

    # If publication already finished (success or failure), just report it.
    status = dset.get_publication_status()
    if status != START_PUBLISH_DATASET_EVENT:
        session.close()
        return status

    if service is None:
        config = getConfig()
        serviceURL = getHessianServiceURL()
        servicePort = config.getint('DEFAULT', 'hessian_service_port')
        serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        service = Hessian(serviceURL,
                          servicePort,
                          key_file=serviceKeyfile,
                          cert_file=serviceCertfile,
                          debug=serviceDebug)

    try:
        statusObj = PublicationStatus(dset.status_id, service)
    except socket.error as e:  # was py2-only "except socket.error, e"
        # repr(e) replaces the py2-only backtick syntax; identical output.
        raise ESGPublishError(
            "Socket error: %s\nIs the proxy certificate %s valid?" %
            (repr(e), service._cert_file))
    # NOTE(review): this snippet ends here with statusObj unused; the original
    # function presumably continues — confirm against the full source.
    def catalog_thredds(self, parent):
        """Prompt for an output file via a Save-As dialog and write a THREDDS
        catalog for the currently selected dataset to it.

        parent
          Parent widget (kept for interface compatibility).
        """
        from esgcet.publish import generateThredds
        from tkMessageBox import showerror

        dialog_icon = tkFileDialog.SaveAs(master=self.parent.control_frame2,
                                          filetypes=[("THREDDS", "*.thredds",
                                                      "THREDDS XML"),
                                                     ("XML", "*.xml",
                                                      "THREDDS XML")],
                                          title='File Open Selection')
        dirfilename = dialog_icon.show(initialdir=os.getcwd())
        if len(dirfilename) == 0:
            # Dialog cancelled.
            return
        dir = dirfilename[:dirfilename.rfind('/')]
        filename = dirfilename[dirfilename.rfind('/') + 1:]
        name = filename[:filename.rfind('.')].strip()

        # Check for directory and filename error
        if dirfilename in [(), '']:
            showerror(
                "ESGPublishError",
                "There was an error in the selected directory and specified filename."
            )
            return
        if name in [(), '']:
            showerror("ESGPublishError",
                      "There was an error in the specified filename.")
            return
        if os.access(dir, os.W_OK) != True:
            showerror(
                "ESGPublishError",
                "You do not have write permission for the selected directory.")
            return

        # Generate a THREDDS configuration file
        threddsOutputPath = dirfilename
        threddsOutput = open(threddsOutputPath, "w")
        try:
            selected_page = self.parent.parent.main_frame.selected_top_page
            datasetName = self.parent.parent.datasetName
            generateThredds(datasetName, self.parent.parent.Session,
                            threddsOutput,
                            self.parent.parent.handlerDictionary[datasetName])
        except Exception as inst:  # was py2-only "except Exception, inst"
            error(traceback.format_exc())
            showerror("ESGPublishError", inst)
        finally:
            # Bug fix: the output file handle was previously never closed.
            threddsOutput.close()
Example #10
0
    def loadEntryPoints(self):
        """
        Get the entry points for the entry point group associated with this registry,
        and build an entry point dictionary.

        Returns optionDict: handler_name => (handler_value, distribution, mustload).
        """
        optionDict = {}
        distPlugins = {}  # distPlugins: entry_point_distribution => distribution_dict
        #   where distribution_dict: entry_point_name => handler_class

        for ep in iter_entry_points(self.entryPointGroup):
            distPlugins.setdefault(ep.dist, {})[ep.name] = ep

        for dist, v in distPlugins.items():
            if HANDLER_NAME_ENTRY_POINT in v:  # has_key is py2-only
                if HANDLER_ENTRY_POINT in v:
                    handlerName = v[HANDLER_NAME_ENTRY_POINT].module_name
                    if handlerName in optionDict:
                        handlerValue = v[HANDLER_ENTRY_POINT]
                        prevValue, prevDist, mustload = optionDict[handlerName]
                        if handlerValue != prevValue:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerValue, prevDist, handlerName, prevValue)
                            )
                    else:
                        optionDict[handlerName] = (v[HANDLER_ENTRY_POINT], dist, True)
                else:
                    # Bug fix: the original formatted an undefined name 'k'
                    # here (NameError); the distribution is 'dist'.
                    warning("Distribution %s does not define a %s option." % (dist, HANDLER_ENTRY_POINT))
            elif HANDLER_DICT_ENTRY_POINT in v:
                handlerDict = v[HANDLER_DICT_ENTRY_POINT].load()
                for handlerName, handlerClassName in handlerDict.items():
                    if handlerName in optionDict:
                        # Bug fix: the original read v[HANDLER_ENTRY_POINT], which
                        # need not exist in this branch (KeyError), and clobbered
                        # the loop variable handlerClassName. Compare the class
                        # name from the handler dictionary instead.
                        prevValue, prevDist, mustload = optionDict[handlerName]
                        if handlerClassName != prevValue:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerClassName, prevDist, handlerName, prevValue)
                            )
                    else:
                        optionDict[handlerName] = (handlerClassName, dist, False)
        return optionDict
Example #11
0
def pollDatasetPublicationStatus(datasetName, Session, service=None):
    """
    Get the current dataset publication status by polling the gateway.

    Returns the current dataset publication status.

    datasetName
      String dataset name.

    Session
      A database Session.

    service
      Web service proxy instance. If None, the service is created.

    """

    session = Session()
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is None:
        messaging.error("Dataset not found: %s"%datasetName)
        session.close()
        return PUBLISH_FAILED_EVENT

    # If publication already finished (success or failure), just report it.
    status = dset.get_publication_status()
    if status!=START_PUBLISH_DATASET_EVENT:
        session.close()
        return status

    if service is None:
        config = getConfig()
        serviceURL = getHessianServiceURL()
        servicePort = config.getint('DEFAULT','hessian_service_port')
        serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)

    try:
        statusObj = PublicationStatus(dset.status_id, service)
    except socket.error as e:  # was py2-only "except socket.error, e"
        # repr(e) replaces the py2-only backtick syntax; identical output.
        raise ESGPublishError("Socket error: %s\nIs the proxy certificate %s valid?"%(repr(e), service._cert_file))
    # NOTE(review): this snippet ends here with statusObj unused; the original
    # function presumably continues — confirm against the full source.
Example #12
0
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True, las=False, progressCallback=None,
                       service=None, perVariable=None, threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False, restInterface=False,
                       schema=None, pid_connector=None, project_config_section=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitilize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status
    
    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is
      set to the string for all datasets being published. This function
      can be overridden in the project handler to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true (default False), write the associated LAS catalog.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to value of thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary. 
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    """

    session = Session()
    resultDict = {}
    # Validate the flag combination up front: readFromCatalog needs a source dictionary.
    if readFromCatalog and threddsCatalogDictionary is None:
            raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s"%datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno==-1:
                # -1 is a sentinel meaning "use the latest version".
                versionno=latestVersion
            if versionno!=latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno,latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno,
                                pid_connector=pid_connector)
                threddsOutput.close()
                # Best-effort: make the catalog group-writable; chmod failures are ignored.
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName,versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s"%threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                # Best-effort: make the catalog group-writable; chmod failures are ignored.
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno,
                                pid_connector=pid_connector)
                threddsCatalogDictionary[(datasetName,versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    # Rebuild the TDS master catalog and restart the TDS server if requested.
    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)
    # NOTE(review): resultDict is initialized but never populated or returned in
    # the visible code; this snippet appears truncated — confirm against the full source.
Example #13
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default False), reinitialize server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    data_node
        String, the datanode to unpublish (only for unpublication from Solr)

    """
    # Validate the requested gateway operation before touching anything.
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            # Not fatal: still record the lookup so later phases can skip it.
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy (Hessian or REST, depending on restInterface)
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            service_certs_location = getServiceCertsLoc()
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, service_certs_location, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                # Explicit version: unpublish "name.vN".
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                # version == -1 means "all versions"; REST can only handle one at a time.
                if service.service_type == 'REST':
                    error('Cannot unpublish multiple versions using REST. Please specify a single dataset version ("dataset_id#1"). Skipping %s' % datasetName)
                    continue
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException, e:
                # py2 backticks == repr(); report the first two lines of the remote error.
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                # Report the last two lines of the publisher error.
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName
    # NOTE(review): the visible code ends inside the gateway phase and never
    # returns resultDict; this snippet appears truncated — confirm against the full source.
Example #14
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default False), reinitialize server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    data_node
        String, the datanode to unpublish (only for unpublication from Solr)

    """
    # Validate the requested gateway operation before touching anything.
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            # Not fatal: still record the lookup so later phases can skip it.
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy (Hessian or REST, depending on restInterface)
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                # Explicit version: unpublish "name.vN".
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException, e:
                # py2 backticks == repr(); report the first two lines of the remote error.
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                # Report the last two lines of the publisher error.
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName
    # NOTE(review): the visible code ends inside the gateway phase and never
    # returns resultDict; this snippet appears truncated — confirm against the full source.
Example #15
0
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """
    # NOTE(review): Python 2 code throughout (backquote repr, cmp(),
    # list-returning map(), lambda-comparator sort); not valid Python 3.

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        # Regenerate from the database: drop existing variables and all
        # non-category attributes so aggregation starts from a clean slate.
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        # NOTE(review): in the datasetInstance-is-None branch this check is
        # unreachable for a missing dataset -- dset.variables above would
        # already have raised AttributeError on None. Confirm intent.
        raise ESGPublishError("Dataset not found: %s"%datasetName)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If a dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there are more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:

            # Get the filevar and variable domain, ordered by dimension
            # sequence number. NAME/LENGTH/SEQ are module-level tuple-index
            # constants for the (name, length, seq) triples built here.
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range

                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                # Same name and same domain: reuse the existing Variable.
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes; first occurrence of a (var, domain, name)
            # triple wins, later files do not overwrite it.
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes (first occurrence wins; 'readDimension'
        # is deliberately excluded).
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            # issueCallback may raise to abort (stopEvent); roll back and
            # close the session before propagating.
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds]:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length: sum the per-file lengths
        # into the (previously zeroed) leading dimension.
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                # Retry per-file purely to pinpoint and log the offending
                # file, then re-raise.
                # NOTE(review): if every retry succeeds (the original failure
                # was transient), control falls through with filevarRanges
                # unbound and the cmp() below raises NameError -- confirm.
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            # mono <= 0 means the dimension increases within a file: sort
            # files ascending by first value; otherwise sort descending.
            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    # Report at most the first three overlapping pairs.
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)

    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
Example #16
0
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """
    # NOTE(review): this is a later variant of aggregateVariables that adds
    # target-variable filtering (dset_target_vars) -- only variables flagged
    # is_target_variable in some file are added to the dataset. Python 2
    # code throughout (backquote repr, cmp(), list-returning map()).

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        # Regenerate from the database: drop existing variables and all
        # non-category attributes so aggregation starts from a clean slate.
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        # NOTE(review): in the datasetInstance-is-None branch this check is
        # unreachable for a missing dataset -- dset.variables above would
        # already have raised AttributeError on None. Confirm intent.
        raise ESGPublishError("Dataset not found: %s"%datasetName)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If a dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there are more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # list of all target variables of a dataset
    dset_target_vars = set()

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:
            # Collect short names of variables flagged for publication;
            # used below to filter what gets attached to the dataset.
            if filevar.is_target_variable:
                dset_target_vars.add(filevar.short_name)

            # Get the filevar and variable domain, ordered by dimension
            # sequence number. NAME/LENGTH/SEQ are module-level tuple-index
            # constants for the (name, length, seq) triples built here.
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range

                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                # Same name and same domain: reuse the existing Variable.
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes; first occurrence of a (var, domain, name)
            # triple wins, later files do not overwrite it.
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes (first occurrence wins; 'readDimension'
        # is deliberately excluded).
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            # issueCallback may raise to abort (stopEvent); roll back and
            # close the session before propagating.
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    # (restricted here to target variables -- the key difference from the
    # earlier version of this function).
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length: sum the per-file lengths
        # into the (previously zeroed) leading dimension.
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                # Retry per-file purely to pinpoint and log the offending
                # file, then re-raise.
                # NOTE(review): if every retry succeeds (the original failure
                # was transient), control falls through with filevarRanges
                # unbound and the cmp() below raises NameError -- confirm.
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            # mono <= 0 means the dimension increases within a file: sort
            # files ascending by first value; otherwise sort descending.
            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    # Report at most the first three overlapping pairs.
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)

    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
Example #17
0
def iterateOverDatasets(projectName,
                        dmap,
                        directoryMap,
                        datasetNames,
                        Session,
                        aggregateDimension,
                        operation,
                        filefilt,
                        initcontext,
                        offlineArg,
                        properties,
                        testProgress1=None,
                        testProgress2=None,
                        handlerDictionary=None,
                        perVariable=None,
                        keepVersion=False,
                        newVersion=None,
                        extraFields=None,
                        masterGateway=None,
                        comment=None,
                        forceAggregate=False,
                        readFiles=False,
                        nodbwrite=False,
                        pid_connector=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    pid_connector
        esgfpid.Connector object to register PIDs

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    # If newVersion is a dict it maps dataset_id => version; save the map so the
    # per-dataset version can be looked up inside the loop below.
    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    # prevProject enforces that all datasets in one call belong to one project.
    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct):
        datasetName, versionno = datasetNames[iloop]

        # Must specify version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError(
                    "Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list."
                )

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s" %
                                      datasetName)

        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        # NOTE(review): this rebinds the *parameter* forceAggregate, so once any
        # offline dataset is encountered, aggregation is no longer forced for the
        # remaining datasets in this call — confirm this is intended.
        if offline:
            forceAggregate = False

        # Get a file iterator and sample file.  Either dmap or directoryMap
        # must be provided (see the docstring); dmap takes precedence.
        if dmap is not None:
            if len(dmap[(datasetName, versionno)]) == 0:
                warning("No files specified for dataset %s, version %d." %
                        (datasetName, versionno))
                continue
            firstFile = dmap[(datasetName, versionno)][0][0]
            fileiter = datasetMapIterator(dmap,
                                          datasetName,
                                          versionno,
                                          extraFields=extraFields,
                                          offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator(
                    [direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator(
                    [sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(
                datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName,
                                       firstFile,
                                       Session,
                                       validate=True,
                                       offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    firstFile)
            # projectName persists across iterations once discovered.
            projectName = handler.name
            info("Using project name = %s" % projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError(
                "Multiple projects found: %s, %s. Can only publish from one project"
                % (prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored' % name)
            else:
                context[name] = value

        # add dataset_version to context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset = None
        # Apportion the progress bar: scan gets the first half of this
        # dataset's slice when online, the whole slice when offline (no
        # aggregation phase follows).
        if testProgress1 is not None:
            testProgress1[1] = (100. / ct) * iloop
            if not offline:
                testProgress1[2] = (100. / ct) * iloop + (50. / ct)
            else:
                testProgress1[2] = (100. / ct) * iloop + (100. / ct)

        # Scan the files and persist dataset/file/variable metadata
        # (unless nodbwrite suppresses database writes).
        dataset = extractFromDataset(datasetName,
                                     fileiter,
                                     Session,
                                     handler,
                                     cfHandler,
                                     aggregateDimensionName=aggregateDimension,
                                     offline=offline,
                                     operation=operation,
                                     progressCallback=testProgress1,
                                     perVariable=perVariable,
                                     keepVersion=keepVersion,
                                     newVersion=newVersion,
                                     extraFields=extraFields,
                                     masterGateway=masterGateway,
                                     comment=comment,
                                     useVersion=versionno,
                                     forceRescan=forceAggregate,
                                     nodbwrite=nodbwrite,
                                     pid_connector=pid_connector,
                                     **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.

        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
            testProgress2[1] = (100. / ct) * iloop + 50. / ct
            testProgress2[2] = (100. / ct) * (iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName,
                               Session,
                               aggregateDimensionName=aggregateDimension,
                               cfHandler=cfHandler,
                               progressCallback=testProgress2,
                               datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        if (not nodbwrite):
            handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
    def dataset_page(self, dataset=None, Session=None, handler=None):
        """Build the dataset-field editing page in the GUI.

        dataset
          Persistent Dataset instance whose field values are displayed. Only
          used when ``handler`` is given.

        Session
          SQLAlchemy session factory, used to query the dataset's id/project.

        handler
          Project handler supplying field names, options, types, and mandatory
          flags. If None, a placeholder page with dummy fields is generated.

        Side effects: populates ``self.field_list`` with the created widgets
        and packs the field frame plus a 'Mandatory Fields' note into the page.
        """
        if handler is not None:
            try:
                self.parent.canvas.pack_forget()  # Remove the white canvas
            except:
                pass

        self.field_list = {}
        validate = []
        mandatory = []
        options = {}
        values = {}
        #--------------------------------------------------------------------------------
        # Generate a dataset page with dummy fields if no dataset was given
        #--------------------------------------------------------------------------------
        if handler is None:
            return_fields = [
                "Project", "Dataset name", "Model", "Experiment", "Run number",
                "Product", "Format"
            ]
            validate = [1, 2, 3, 4, 1, 2, 3, 4, 1]
            mandatory = [
                True, False, True, False, True, False, True, False, True
            ]
            options = values = {
                "Project": None,
                "Dataset name": None,
                "Model": None,
                "Experiment": None,
                "Run number": None,
                "Product": None,
                "Format": None
            }
        else:
            #--------------------------------------------------------------------------------
            # Retrieve the dataset fields and properties from the queryDataset command
            #--------------------------------------------------------------------------------
            list_fields = getQueryFields(handler)
            properties = {'id': (1, dataset.get_id(Session))}
            for x in list_fields:
                # Bug fix: the original tested "x is not 'id'", which compares
                # object identity, not string equality.
                if x != "id": properties[x] = (2, "%")
            values, return_fields = queryDatasets(dataset.get_project(Session),
                                                  handler, Session, properties)
            for x in return_fields:
                validate.append(handler.getFieldType(x))
                options[x] = handler.getFieldOptions(x)
                mandatory.append(handler.isMandatory(x))

        #--------------------------------------------------------------------------------
        # View the dataset fields in the page
        #--------------------------------------------------------------------------------
        for i in range(len(return_fields)):
            value = values[0][i]
            try:
                self.field_list[return_fields[i]] = show_field(
                    self.parent, self.dataset_frame,
                    return_fields[i].capitalize(), options[return_fields[i]],
                    value, mandatory[i], validate[i])
            except:
                # Log the offending field's parameters before re-raising, so a
                # bad option list or value is easy to track down.
                field = return_fields[i]
                opts = options[field]
                mand = mandatory[i]
                valid = validate[i]
                error(
                    "Error in show_fields: field=%s, options=%s, value=%s, mandatory=%s, validate=%s"
                    % (field, repr(opts), repr(value), mand, valid))
                error(traceback.format_exc())
                raise

        Pmw.alignlabels(
            self.field_list.values())  # align the labels for a clean look

        self.dataset_frame.pack(side='left', expand=1, fill='both', pady=2)

        #--------------------------------------------------------------------------------
        # Create and pack the Group to display the message about mandatory fields
        #--------------------------------------------------------------------------------
        txtFont = tkFont.Font(self.parent,
                              family=pub_controls.text_font_type,
                              size=pub_controls.text_font_size,
                              weight=font_weight)
        g = Pmw.Group(self.dataset_sframe.interior(),
                      tag_text='Mandatory Fields',
                      tag_font=txtFont)
        g.pack(fill='x', padx=36)
        cw = Tkinter.Label(
            g.interior(),
            text=
            'All fields that begin with an asterisk\n"*" and in blue, must have an entry.',
            font=txtFont)
        cw.pack(padx=2, pady=2, expand='yes', fill='both')
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs)==0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None
    
    for flag, arg in args:
        if flag=='-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag=='--dataset':
            datasetName = arg
        elif flag=='--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag=='--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag=='--offline':
            offline = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--service':
            service = arg
        elif flag=='--use-version-dir':
            version_dir = True
        elif flag=='--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg
    
    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                            continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesURL is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])

                for md in mapfile_md[fpath][1:]:
                    mapfile_line+=' | %s'%md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ( (appendMap is None) or (not appendMap.has_key(ds_id)) or (( fpath, "%d"% mapfile_md[fpath][1]) not in appendMap[ds_id]) ):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
Example #20
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las  
      Boolean flag: if true (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    """

    # Reject anything outside the known gateway-operation enumeration up front.
    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy: legacy Hessian by default, REST when
        # restInterface is set. Both use the configured client certificate/key.
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if (not DELETE_AT_DATASET_LEVEL) and (dset is not None):
                # Per-version deletion: retract each version object individually.
                # A failure for one version logs an error and moves on.
                for versionObj in versionObjs:
                    try:
                        eventName, stateName = deleteGatewayDatasetVersion(versionObj.name, gatewayOperation, service, session, dset=dset)
                    except RemoteCallException, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                        continue
                    except ESGPublishError, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                        continue
                    info("  Result: %s"%stateName)
                    resultDict[datasetName] = eventName
            else:                       # Nothing in the node database, but still try to delete on the gateway
                if DELETE_AT_DATASET_LEVEL and (dset is not None) and (not restInterface):
                    datasetName = dset.name
                # Same error handling as the per-version branch above: log and
                # continue to the next dataset on a remote or publish error.
                try:
                    eventName, stateName = deleteGatewayDatasetVersion(datasetName, gatewayOperation, service, session, dset=dset)
                except RemoteCallException, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                    continue
                except ESGPublishError, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                    continue
                # NOTE(review): this excerpt ends here — the THREDDS/LAS/database
                # phases and the return of resultDict described in the docstring
                # are not visible in this chunk; the function appears truncated.
Example #21
0
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True, las=False, progressCallback=None, service=None, perVariable=None, threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False, restInterface=False, schema=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status
    
    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is
      set to the string for all datasets being published. This function
      can be overridden in the project handler to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler. If None, handlers are looked up by each
      dataset's project via ``getHandlerByName``.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true (default is False), reinitialize the LAS server.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to value of thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary. 
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    """

    session = Session()
    resultDict = {}
    # readFromCatalog means "write catalogs from pre-built strings", so the
    # dictionary holding those strings is mandatory in that mode.
    if readFromCatalog and threddsCatalogDictionary is None:
            raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset: either use the caller-supplied mapping,
    # or look each one up from the dataset's project in the node database.
    if handlerDictionary is None:
        handlers = {}
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s"%datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            # versionno == -1 is the sentinel for "use the latest version"
            if versionno==-1:
                versionno=latestVersion
            if versionno!=latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno,latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno)
                threddsOutput.close()
                # Best-effort: make the catalog group-writable; ignore failures
                # (e.g. the process does not own the file).
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName,versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s"%threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                # Best-effort chmod, as above.
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno)
                threddsCatalogDictionary[(datasetName,versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    # Rebuild the TDS master catalog and tell the TDS server to re-read it.
    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:    
        # LAS reinitialization is best-effort: log and carry on if it fails.
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)
Example #22
0
def deleteDatasetList(datasetNames,
                      Session,
                      gatewayOperation=UNPUBLISH,
                      thredds=True,
                      las=False,
                      deleteInDatabase=False,
                      progressCallback=None,
                      deleteAll=False,
                      republish=False,
                      restInterface=False):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las  
      Boolean flag: if true (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    restInterface
      Boolean flag. If True, delete datasets via the RESTful publication services.

    """

    # Validate the requested gateway operation up front.
    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d" %
                              gatewayOperation)
    deleteOnGateway = (gatewayOperation == DELETE)
    operation = (gatewayOperation != NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName, version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(
            datasetName,
            version,
            session,
            deleteAll=deleteAll,
            restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s" % datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy: Hessian for the legacy interface,
        # RestPublicationService when restInterface is requested.
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT', 'hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT',
                                             'hessian_service_debug')
            service = Hessian(serviceURL,
                              servicePort,
                              key_file=serviceKeyfile,
                              cert_file=serviceCertfile,
                              debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT',
                                             'rest_service_debug',
                                             default=False)
            service = RestPublicationService(serviceURL,
                                             serviceCertfile,
                                             keyFile=serviceKeyfile,
                                             debug=serviceDebug)

        for datasetName, version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if (not DELETE_AT_DATASET_LEVEL) and (dset is not None):
                # Version-level deletion: remove each cached version object
                # individually on the gateway.
                for versionObj in versionObjs:
                    try:
                        eventName, stateName = deleteGatewayDatasetVersion(
                            versionObj.name,
                            gatewayOperation,
                            service,
                            session,
                            dset=dset)
                    except RemoteCallException, e:
                        # First two lines of the remote error carry the message.
                        fields = ` e `.split('\n')
                        error(
                            "Deletion/retraction failed for dataset/version %s with message: %s"
                            % (datasetName, string.join(fields[0:2], '\n')))
                        continue
                    except ESGPublishError, e:
                        # Last two lines of the local error carry the message.
                        fields = ` e `.split('\n')
                        error(
                            "Deletion/retraction failed for dataset/version %s with message: %s"
                            % (datasetName, string.join(fields[-2:], '\n')))
                        continue
                    info("  Result: %s" % stateName)
                    resultDict[datasetName] = eventName
            else:  # Nothing in the node database, but still try to delete on the gateway
                if DELETE_AT_DATASET_LEVEL and (dset is not None) and (
                        not restInterface):
                    datasetName = dset.name
                try:
                    eventName, stateName = deleteGatewayDatasetVersion(
                        datasetName,
                        gatewayOperation,
                        service,
                        session,
                        dset=dset)
                except RemoteCallException, e:
                    fields = ` e `.split('\n')
                    error(
                        "Deletion/retraction failed for dataset/version %s with message: %s"
                        % (datasetName, string.join(fields[0:2], '\n')))
                    continue
                except ESGPublishError, e:
                    fields = ` e `.split('\n')
                    error(
                        "Deletion/retraction failed for dataset/version %s with message: %s"
                        % (datasetName, string.join(fields[-2:], '\n')))
                    continue
Example #23
0
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of (dataset_name, version) tuples identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    # newVersion may be a per-dataset mapping; stash it so the loop can
    # rebind newVersion to a plain integer on each iteration.
    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct): 
        datasetName,versionno = datasetNames[iloop]

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s"%datasetName)
            
        # Fresh copy per dataset so one dataset's context never leaks into the next.
        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate=False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName,versionno)])==0:
                warning("No files specified for dataset %s, version %d."%(datasetName,versionno))
                continue
            firstFile = dmap[(datasetName,versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                # One entry per directory: walk each directory for matching files.
                fileiter  = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                # One entry per file: iterate the explicit file list.
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name
            info("Using project name = %s"%projectName)
        # All datasets in one invocation must belong to the same project.
        if prevProject is not None and projectName!=prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored'%name)
            else:
                context[name] = value

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset=None
        # Scan phase covers the first half of this dataset's progress share
        # when online, the whole share when offline (no aggregation follows).
        if testProgress1 is not None:
           testProgress1[1] = (100./ct)*iloop
           if not offline:
              testProgress1[2] = (100./ct)*iloop + (50./ct)
           else:
              testProgress1[2] = (100./ct)*iloop + (100./ct)
        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        # Aggregation phase covers the second half of this dataset's progress share.
        if testProgress2 is not None:
           testProgress2[1] = (100./ct)*iloop + 50./ct
           testProgress2[2] = (100./ct)*(iloop + 1)
        if runAggregate:
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)
            
        # Save the context with the dataset, so that it can be searched later
        handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
Example #24
0
   def dataset_page( self, dataset = None, Session = None, handler = None ):
        if handler != None:
           try: self.parent.canvas.pack_forget()  # Remove the white canvas
           except: pass

        self.field_list = {}
        validate = []
        mandatory = []
        options = {}
        values = {}
        #--------------------------------------------------------------------------------
        # Generate a dataset page with dummy fields if no dataset was given
        #--------------------------------------------------------------------------------
        if handler == None:
           return_fields = ["Project", "Dataset name", "Model", "Experiment", "Run number", "Product", "Format"]
           validate = [1,2,3,4,1,2,3,4,1]
           mandatory = [True,False,True,False,True,False,True,False,True]
           options = values = {"Project":None, "Dataset name":None, "Model":None, "Experiment":None, "Run number":None, "Product":None, "Format":None}
        else:
           #--------------------------------------------------------------------------------
           # Retrieve the dataset fields and properties from the queryDataset command
           #--------------------------------------------------------------------------------
           list_fields = getQueryFields( handler )
           properties = {'id':(1, dataset.get_id( Session ))}
           for x in list_fields:
               if x is not "id": properties[ x ] = (2, "%")
           values, return_fields = queryDatasets(dataset.get_project( Session ), handler, Session, properties)
           for x in return_fields:
                validate.append( handler.getFieldType( x ) )
                options[ x ] = handler.getFieldOptions( x )
                mandatory.append( handler.isMandatory( x ) )

        #--------------------------------------------------------------------------------
        # View the dataset fields in the page
        #--------------------------------------------------------------------------------
        for i in range(len(return_fields)):
            #print " ganz dataset test %s", return_fields[i]
            #print values[0][i]
            value = values[0][i]
            try:
                self.field_list[return_fields[i]] = show_field( self.parent, self.dataset_frame, return_fields[i].capitalize(), options[ return_fields[i] ], value, mandatory[i], validate[i] )
            except:
                field = return_fields[i]
                opts = options[field]
                mand = mandatory[i]
                valid = validate[i]
                error("Error in show_fields: field=%s, options=%s, value=%s, mandatory=%s, validate=%s"%(field, `opts`, `value`, mand, valid))
                error(traceback.format_exc())
                raise

        Pmw.alignlabels (self.field_list.values()) # alien the labels for a clean look

        self.dataset_frame.pack(side = 'left', expand=1, fill='both' , pady = 2)

        #--------------------------------------------------------------------------------
        # Create and pack the Group to display the message about mandatory fields
        #--------------------------------------------------------------------------------
        txtFont=tkFont.Font(self.parent, family = pub_controls.text_font_type, size=pub_controls.text_font_size, weight=font_weight)
	g = Pmw.Group(self.dataset_sframe.interior(),
                      tag_text='Mandatory Fields',
                      tag_font=txtFont
                     )
	g.pack(fill = 'x', padx = 36)
	cw = Tkinter.Label(g.interior(),
		text = 'All fields that begin with an asterisk\n"*" and in blue, must have an entry.',
                font = txtFont
             )
	cw.pack(padx = 2, pady = 2, expand='yes', fill='both')