Example #1
def checkAndUpdateRepo(cmor_table_path, handler, ds_version):
    """
        Checks for a file written to a predefined location.  if not present or too old, will pull the repo based on the input path argument and update the timestamp.
    """
    pull_cmor_repo = False

    if os.path.exists(UPDATE_TIMESTAMP):
        mtime = os.path.getmtime(UPDATE_TIMESTAMP)
        now = time()
        if now - mtime > (86400.0):
            pull_cmor_repo = True 

    else:
        pull_cmor_repo = True

    if pull_cmor_repo:

        try:
            os.system("pushd "+cmor_table_path+" ; git pull ; popd")
            f = open(UPDATE_TIMESTAMP, "w")
            f.write("t")
            f.close()
        except Exception as e:
            warning("Attempted to update the CMOR table repo but encountered an error: " + str(e))

    if handler.data_specs_version != ds_version:
        try:
            os.system("pushd "+cmor_table_path+" ; git checkout "+ds_version+ " ; popd")
            handler.set_spec_version(ds_version)
            
        except Exception as e:
            raise ESGPublishError("data_specs_version tag %s not found in the CMOR tables, or another error occurred. Please contact support."%ds_version)
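
The pattern above is a simple staleness guard: a timestamp file caps `git pull` to at most once per day. Here is a minimal sketch of the same idea using subprocess instead of os.system with pushd/popd; the timestamp path and helper names are illustrative, not part of the publisher.

import os
import subprocess
from time import time

UPDATE_TIMESTAMP = "/tmp/cmor_table_update.timestamp"  # hypothetical location

def repo_is_stale(max_age_seconds=86400.0):
    # Stale if the timestamp file is missing or older than max_age_seconds.
    if not os.path.exists(UPDATE_TIMESTAMP):
        return True
    return time() - os.path.getmtime(UPDATE_TIMESTAMP) > max_age_seconds

def pull_repo(cmor_table_path):
    # Run 'git pull' inside the table directory, then refresh the timestamp file.
    subprocess.check_call(["git", "pull"], cwd=cmor_table_path)
    with open(UPDATE_TIMESTAMP, "w") as f:
        f.write("t")
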
Example #2
    def get_citation_url(self, project_config_section, config, dataset_name,
                         dataset_version, test_publication):
        """ Returns the citation_url if a project uses citation, otherwise returns None

        project_config_section
            The name of the project section in the ini file

        config
            The configuration (ini files)

        dataset_name
            Name of the dataset

        dataset_version
            Version of the dataset

        test_publication
            Flag indicating a test publication (not used by this method)
        """
        if config.has_option(project_config_section, 'citation_url'):
            try:
                pattern = self.getFilters(option='dataset_id')
                attributes = re.match(pattern[0], dataset_name).groupdict()
                if 'version' not in attributes:
                    attributes['version'] = str(dataset_version)
                if 'dataset_id' not in attributes:
                    attributes['dataset_id'] = dataset_name
                return config.get(project_config_section, 'citation_url', 0,
                                  attributes)
            except:
                warning('Unable to generate a citation url for %s' %
                        dataset_name)
                return None
        else:
            return None
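
The config.get(project_config_section, 'citation_url', 0, attributes) call relies on the Python 2 ConfigParser.get(section, option, raw, vars) signature: %(name)s placeholders in the option value are interpolated from the attributes mapping. A small illustration with a made-up section and URL template:

from ConfigParser import SafeConfigParser  # Python 2 module name, as in the code above

config = SafeConfigParser()
config.add_section('project:cmip5')
config.set('project:cmip5', 'citation_url',
           'http://example.org/citation/%(dataset_id)s.v%(version)s.json')

attributes = {'dataset_id': 'cmip5.output1.MODEL.expt.mon.atmos.r1i1p1',
              'version': '20110101'}
# The 0 disables raw mode, so %(...)s placeholders are expanded from 'attributes'.
print config.get('project:cmip5', 'citation_url', 0, attributes)
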
Example #3
def getHandler(path, Session, validate=True, **extra_args):
    """
    Get a project handler from a file. The project is determined by trying to create each registered handler using the file.

    path
      String path of the file to read project info from.

    Session
      SQLAlchemy Session

    validate
      If True, create a validating handler which will raise an error if an invalid field value is read or input.

    Any other keyword arguments are passed to the handler.
    """

    found = False
    items = projectRegistry.items()
    items.sort(lambda x, y: cmp(projectRegistry.order(x[0]), projectRegistry.order(y[0])))
    for name, cls in items:
        try:
            handler = instantiateHandler(cls, name, path, Session, validate, **extra_args)
        except ESGInvalidMetadataFormat:
            continue
        found = True
        break
    if not found:
        warning('No project handler found for file %s'%path)
        handler = None

    return handler
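
The items.sort(lambda x, y: cmp(...)) call above is the Python 2 comparator style; the same priority ordering can be expressed with a key function. A sketch, assuming projectRegistry.order(name) returns a sortable priority as in the code above:

# Equivalent ordering with a key function instead of a cmp comparator.
items = sorted(projectRegistry.items(),
               key=lambda item: projectRegistry.order(item[0]))
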
Example #4
def getHandler(path, Session, validate=True):
    """
    Get a project handler from a file. The project is determined by trying to create each registered handler using the file.

    path
      String path of the file to read project info from.

    Session
      SQLAlchemy Session

    validate
      If True, create a validating handler which will raise an error if an invalid field value is read or input.
    """

    found = False
    items = projectRegistry.items()
    items.sort(lambda x, y: cmp(projectRegistry.order(x[0]),
                                projectRegistry.order(y[0])))
    for name, cls in items:
        try:
            handler = cls(name, path, Session, validate)
        except ESGInvalidMetadataFormat:
            continue
        found = True
        break
    if not found:
        warning('No project handler found for file %s' % path)
        handler = None

    return handler
Example #5
    def get_citation_url(self, project_config_section, config, dataset_name, dataset_version, test_publication):
        """ Returns the citation_url if a project uses citation, otherwise returns None

        project_config_section
            The name of the project section in the ini file

        config
            The configuration (ini files)

        dataset_name
            Name of the dataset

        dataset_version
            Version of the dataset

        test_publication
            Flag indicating a test publication (not used by this method)
        """
        if config.has_option(project_config_section, 'citation_url'):
            try:
                pattern = self.getFilters(option='dataset_id')
                attributes = re.match(pattern[0], dataset_name).groupdict()
                if 'version' not in attributes:
                    attributes['version'] = str(dataset_version)
                if 'dataset_id' not in attributes:
                    attributes['dataset_id'] = dataset_name
                return config.get(project_config_section, 'citation_url', 0, attributes)
            except:
                warning('Unable to generate a citation url for %s' % dataset_name)
                return None
        else:
            return None
Example #6
    def register(self, projectName, moduleName, className):
        try:
            __import__(moduleName)
        except:
            warning('Cannot import handler %s:%s for project %s'%(moduleName, className, projectName))
            return  # skip registration if the module cannot be imported

        m = sys.modules[moduleName]
        cls = m.__dict__.get(className)
        if cls is None:
            warning('No such class in %s: %s'%(moduleName, className))
            return  # do not register a missing class

        self.registry[projectName] = cls
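
The method above registers a handler class by importing its module dynamically. A sketch of the same idea with importlib (the registry dict and the warning helper are assumptions carried over from the example; this is not the library's code):

import importlib

def register_handler(registry, projectName, moduleName, className):
    try:
        module = importlib.import_module(moduleName)
    except ImportError as e:
        warning('Cannot import handler %s:%s for project %s (%s)'
                % (moduleName, className, projectName, e))
        return
    cls = getattr(module, className, None)
    if cls is None:
        warning('No such class in %s: %s' % (moduleName, className))
        return
    registry[projectName] = cls
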
Example #7
def updateDatasetFromContext(context, datasetName, Session):
    """

    Update a persistent dataset with values from context (name/value dictionary). The context may have
    fields such as event fields, not associated with the project handler.

    context
      A property (name/value) dictionary.

    datasetName
      String dataset identifier.

    Session
      Database session factory.

    """

    dset = Dataset.lookup(datasetName, Session)
    if dset is None:
        raise ESGQueryError("Dataset not found: %s" % datasetName)
    projectName = dset.get_project(Session)
    handler = getHandlerByName(projectName, None, Session)
    basicHeaders, eventHeaders, categories, derivedHeaders = getQueryFields(
        handler, return_list=False)
    properties = context.copy()

    # Set basic and event properties
    session = Session()
    session.add(dset)
    for key, value in properties.items():
        if key in basicHeaders:
            if key != 'id':
                if key == 'name':
                    if len(handler.parseDatasetName(value, {})) == 0:
                        warning(
                            "Dataset name: %s does not match dataset_id pattern in config file."
                            % value)
                setattr(dset, key, value)
            else:
                warning("Cannot update id field")
            del properties[key]
        elif key in eventHeaders:
            event = dset.events[-1]
            setEvent(event, key, value)
            del properties[key]

    # Set attribute headers
    handler.setContext(properties)
    handler.saveContext(datasetName, Session)

    session.commit()
    session.close()
Example #8
    def register(self, projectName, moduleName, className):
        try:
            __import__(moduleName)
        except:
            warning('Cannot import handler %s:%s for project %s' %
                    (moduleName, className, projectName))
            return  # skip registration if the module cannot be imported

        m = sys.modules[moduleName]
        cls = m.__dict__.get(className)
        if cls is None:
            warning('No such class in %s: %s' % (moduleName, className))
            return  # do not register a missing class

        self.registry[projectName] = cls
Example #9
    def loadEntryPoints(self):
        """
        Get the entry points for the entry point group associated with this registry,
        and build an entry point dictionary.
        """
        optionDict = {}
        distPlugins = {
        }  # distPlugins: entry_point_distribution => distribution_dict
        #   where distribution_dict: entry_point_name => handler_class

        for ep in iter_entry_points(self.entryPointGroup):
            if distPlugins.has_key(ep.dist):
                distPlugins[ep.dist][ep.name] = ep
            else:
                distPlugins[ep.dist] = {ep.name: ep}

        for dist, v in distPlugins.items():
            if v.has_key(HANDLER_NAME_ENTRY_POINT):
                if v.has_key(HANDLER_ENTRY_POINT):
                    handlerName = v[HANDLER_NAME_ENTRY_POINT].module_name
                    if optionDict.has_key(handlerName):
                        handlerValue = v[HANDLER_ENTRY_POINT]
                        handlerClassName, prevDist, mustload = optionDict[
                            handlerName]
                        if handlerValue != handlerClassName:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerValue, prevDist,
                                   handlerName, handlerClassName))
                    else:
                        optionDict[handlerName] = (v[HANDLER_ENTRY_POINT],
                                                   dist, True)
                else:
                    warning("Distribution %s does not define a %s option." %
                            (dist, HANDLER_ENTRY_POINT))
            elif v.has_key(HANDLER_DICT_ENTRY_POINT):
                handlerDict = v[HANDLER_DICT_ENTRY_POINT].load()
                for handlerName, handlerClassName in handlerDict.items():
                    if optionDict.has_key(handlerName):
                        handlerValue = v[HANDLER_ENTRY_POINT]
                        handlerClassName, prevDist, mustload = optionDict[
                            handlerName]
                        if handlerValue != handlerClassName:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerValue, prevDist,
                                   handlerName, handlerClassName))
                    else:
                        optionDict[handlerName] = (handlerClassName, dist,
                                                   False)
        return optionDict
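
The first loop groups entry points by the distribution that provides them, so that a package's handler-name and handler-class entry points can be matched against each other. A condensed sketch of that grouping step (the entry point group name is a placeholder):

from pkg_resources import iter_entry_points

ENTRY_POINT_GROUP = 'esgcet.project_handlers'  # hypothetical group name

distPlugins = {}
for ep in iter_entry_points(ENTRY_POINT_GROUP):
    # One dict of entry points per distribution: {entry_point_name: entry_point}
    distPlugins.setdefault(ep.dist, {})[ep.name] = ep
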
Example #10
    def validateFile(self, fileobj):
        """Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler."""
        if not fileobj.hasAttribute('project_id'):
            result = False
            message = "No global attribute: project_id"
        else:
            project_id = fileobj.getAttribute('project_id', None)
            result = (project_id[:5] == "CMIP5")
            message = "project_id should be 'CMIP5'"
        if not result:
            if WARN:
                warning(message)
            else:
                raise ESGInvalidMetadataFormat(message)
Example #11
    def validateFile(self, fileobj):
        """Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler."""
        if not fileobj.hasAttribute('project_id'):
            result = False
            message = "No global attribute: project_id"
        else:
            project_id = fileobj.getAttribute('project_id', None)
            result = (project_id[:5] == "CMIP5")
            message = "project_id should be 'CMIP5'"
        if not result:
            if WARN:
                warning(message)
            else:
                raise ESGInvalidMetadataFormat(message)
Example #12
def getHessianServiceURL():
    """Get the configured value of hessian_service_url"""

    config = getConfig()
    serviceURL = config.get('DEFAULT', 'hessian_service_url')

    gatewayServiceRoot = os.environ.get('ESG_GATEWAY_SVC_ROOT', None)
    if gatewayServiceRoot is not None:
        dum, serviceHost, dum, dum, dum, dum = urlparse.urlparse(serviceURL)
        dum, envServiceHost, dum, dum, dum, dum = urlparse.urlparse('http://'+gatewayServiceRoot)
        if serviceHost!=envServiceHost:
            warning("hessian_service_url=%s but environment variable ESG_GATEWAY_SVC_ROOT=%s, please reconcile these values"%(serviceURL, gatewayServiceRoot))

    return serviceURL
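
Only the host (netloc) component of the two URLs is compared; scheme and path are ignored. A small illustration with made-up values:

import urlparse  # Python 2 module name, matching the code above

serviceURL = 'https://esg-node.example.org/esg-search/hessian/publishingService'
gatewayServiceRoot = 'esg-node.example.org/esg-search'

serviceHost = urlparse.urlparse(serviceURL)[1]                        # 'esg-node.example.org'
envServiceHost = urlparse.urlparse('http://' + gatewayServiceRoot)[1]
print serviceHost == envServiceHost  # True, so no warning would be issued
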
Example #13
    def readContext(self, fileInstance, **kw):
        """Get a dictionary of attribute/value pairs from an open file.

        Returns a dictionary of attribute/value pairs, which are added to the handler context.

        fileInstance
          Format handler instance representing the opened file, an instance of FormatHandler
          or a subclass.

        kw
          Optional keyword arguments.

        """
        result = {}
        f = fileInstance.file

        result = IPCC5Handler.readContext(self, fileInstance, **kw)
        if 'project' not in result:
            result['project'] = self.name

        # Parse CMOR table.
        if hasattr(f, 'table_id'):
            tableId = getattr(f, 'table_id')
            fields = tableId.split()

            # Assume table ID has the form 'Table table_id ...'
            if len(fields) > 1:
                table = fields[1]
                result['cmor_table'] = table
            else:
                result['cmor_table'] = 'noTable'

        # Read categories as file attributes, for values not already set
        for category in self.getFieldNames():
            if category not in result and hasattr(f, category):
                result[category] = getattr(f, category)

            # Check if mandatory fields are set
            if self.isMandatory(category) and category not in result:
                warning(
                    "Mandatory category %s not set for file %s, use -p option?"
                    % (category, fileInstance.path))

        # Check validity
        self.validateContext(result)

        # Return the attribute/value dictionary
        return result
Example #14
    def readContext(self, fileInstance, **kw):
        """Get a dictionary of attribute/value pairs from an open file.

        Returns a dictionary of attribute/value pairs, which are added to the handler context.

        fileInstance
          Format handler instance representing the opened file, an instance of FormatHandler
          or a subclass.

        kw
          Optional keyword arguments.

        """
        result = {}
        f = fileInstance.file

        result = IPCC5Handler.readContext(self, fileInstance, **kw)
        if 'project' not in result:
            result['project'] = self.name

        # Parse CMOR table.
        if hasattr(f, 'table_id'):
            tableId = getattr(f, 'table_id')
            fields = tableId.split()

            # Assume table ID has the form 'Table table_id ...'
            if len(fields)>1:
                table = fields[1]
                result['cmor_table'] = table
            else:
                result['cmor_table'] = 'noTable'

        # Read categories as file attributes, for values not already set
        for category in self.getFieldNames():
            if category not in result and hasattr(f, category):
                result[category] = getattr(f, category)

            # Check if mandatory fields are set
            if self.isMandatory(category) and category not in result:
                warning("Mandatory category %s not set for file %s, use -p option?"%(category, fileInstance.path))

        # Check validity
        self.validateContext(result)

        # Return the attribute/value dictionary
        return result
Example #15
    def loadEntryPoints(self):
        """
        Get the entry points for the entry point group associated with this registry,
        and build an entry point dictionary.
        """
        optionDict = {}
        distPlugins = {}  # distPlugins: entry_point_distribution => distribution_dict
        #   where distribution_dict: entry_point_name => handler_class

        for ep in iter_entry_points(self.entryPointGroup):
            if distPlugins.has_key(ep.dist):
                distPlugins[ep.dist][ep.name] = ep
            else:
                distPlugins[ep.dist] = {ep.name: ep}

        for dist, v in distPlugins.items():
            if v.has_key(HANDLER_NAME_ENTRY_POINT):
                if v.has_key(HANDLER_ENTRY_POINT):
                    handlerName = v[HANDLER_NAME_ENTRY_POINT].module_name
                    if optionDict.has_key(handlerName):
                        handlerValue = v[HANDLER_ENTRY_POINT]
                        handlerClassName, prevDist, mustload = optionDict[handlerName]
                        if handlerValue != handlerClassName:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerValue, prevDist, handlerName, handlerClassName)
                            )
                    else:
                        optionDict[handlerName] = (v[HANDLER_ENTRY_POINT], dist, True)
                else:
                    warning("Distribution %s does not define a %s option." % (dist, HANDLER_ENTRY_POINT))
            elif v.has_key(HANDLER_DICT_ENTRY_POINT):
                handlerDict = v[HANDLER_DICT_ENTRY_POINT].load()
                for handlerName, handlerClassName in handlerDict.items():
                    if optionDict.has_key(handlerName):
                        handlerValue = v[HANDLER_ENTRY_POINT]
                        handlerClassName, prevDist, mustload = optionDict[handlerName]
                        if handlerValue != handlerClassName:
                            error(
                                "Conflicting handler names found:\n  In distribution %s, %s => (%s);\n  In distribution %s, %s => (%s)\n  To remove the error uninstall one of the packages with 'easy_install -mxN package_name'."
                                % (dist, handlerName, handlerValue, prevDist, handlerName, handlerClassName)
                            )
                    else:
                        optionDict[handlerName] = (handlerClassName, dist, False)
        return optionDict
Example #16
    def check_pid_avail(self, project_config_section, config, version=None):
        """ Returns the pid_prefix

         project_config_section
            The name of the project config section in esg.ini

        config
            The configuration (ini files)

        version
            Integer or Dict with dataset versions
        """
        # disable PIDs for local index without versioning (IPSL use case)
        if isinstance(version, int) and not version_pattern.match(str(version)):
            warning('Version %s, skipping PID generation.' % version)
            return None

        return '21.14100'
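
version_pattern is defined elsewhere in the module and is not shown above. A plausible (hypothetical) definition that matches the intent of skipping non-date versions would be an eight-digit YYYYMMDD stamp; the real pattern in the publisher may differ.

import re

version_pattern = re.compile(r'^\d{8}$')  # hypothetical definition

print bool(version_pattern.match(str(20190101)))  # True  -> the PID prefix is returned
print bool(version_pattern.match(str(1)))         # False -> PID generation is skipped
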
Example #17
def getDerived(dset, dsetVersion, derivedHeaders, handler):
    result = []
    for attname in derivedHeaders:
        if attname=='version':
            value = str(dsetVersion.version)
        elif attname=='parent':
            dsetname = dset.name
            try:
                value  = handler.getParentId(dsetname)
            except:
                warning("Cannot determine parent id for dataset %s"%dsetname)
                value = ''
        elif attname=='version_name':
            value = dsetVersion.name
        elif attname=='comment':
            value = dsetVersion.comment
        else:
            value = ''  # unrecognized derived header
        result.append(value)
    return result
Example #18
def getDerived(dset, dsetVersion, derivedHeaders, handler):
    result = []
    for attname in derivedHeaders:
        if attname == 'version':
            value = str(dsetVersion.version)
        elif attname == 'parent':
            dsetname = dset.name
            try:
                value = handler.getParentId(dsetname)
            except:
                warning("Cannot determine parent id for dataset %s" % dsetname)
                value = ''
        elif attname == 'version_name':
            value = dsetVersion.name
        elif attname == 'comment':
            value = dsetVersion.comment
        else:
            value = ''  # unrecognized derived header
        result.append(value)
    return result
Example #19
    def readContext(self, cdfile, model=''):
        "Get a dictionary of keys from an open file"
        result = BasicHandler.readContext(self, cdfile)
        f = cdfile.file

        for key, value in cmorAttributes.items():
            try:
                result[key] = getattr(f, value)
                if key in cmorArrayAttributes and type(result[key]) is numpy.ndarray:
                    res = str(result[key][0])
                    if key=='run_name':
                        if res[0:3]!='run':
                            res = 'run'+res
                    result[key] = res
            except:
                pass

        if 'realization' in result and 'initialization_method' in result and 'physics_version' in result:
            ensemble = 'r%si%sp%s'%(result['realization'], result['initialization_method'], result['physics_version'])
            result['ensemble'] = ensemble
            result['run_name'] = ensemble

        base = os.path.basename(cdfile.path)
        try:
            index = base.index('_')
            varname = base[0:index]
            result['variable'] = varname
        except:
            warning("File path must have the form varname_XXX: %s"%cdfile.path)

        if not result.has_key('product'):
            result['product'] = 'output'

        self.mapEnumeratedValues(result)

        # If realm has multiple fields, pick the first one
        if 'realm' in result:
            realm = result['realm'].strip()
            if realm.find(' ')!=-1:
                realms = realm.split(' ')
                result['realm'] = realms[0]

        # Parse CMOR table.
        if 'table_id' in result:
            tableId = result['table_id']
            fields = tableId.split()

            # Assume table ID has the form 'Table table_id ...'
            if len(fields)>1 and (fields[1] in cmorTables):
                table = fields[1]
                result['cmor_table'] = table
            else:
                result['cmor_table'] = 'noTable'
        else:
            result['cmor_table'] = 'noTable'

        # Parse the product if it is unresolved
        if result['product']=='output':
            cmor_table = result['cmor_table']
            variable = result.get('variable', None)
            experiment = result.get('experiment', None)
            dateRange = self.getDateRangeFromPath()
            year1 = dateRange[0][0]
            year2 = dateRange[1][0]
            if year2 is None:
                year2 = year1
            result['product'] = getProduct(cmor_table, variable, experiment, year1, year2)

        validateDRSFieldValues(result, cdfile)

        return result
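
The ensemble identifier assembled above follows the CMIP5 r<N>i<N>p<N> convention, combining realization, initialization method, and physics version. A tiny worked example with made-up attribute values:

result = {'realization': '1', 'initialization_method': '1', 'physics_version': '2'}
ensemble = 'r%si%sp%s' % (result['realization'],
                          result['initialization_method'],
                          result['physics_version'])
print ensemble  # r1i1p2
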
Example #20
def extractFromFile(dataset, openfile, fileobj, session, cfHandler, aggdimName=None, varlocate=None, **context):
    """
    Extract metadata from a file, add to a database.

    dataset
      The dataset instance.

    openfile
      An open netCDF file object.

    fileobj
      A (logical) file instance.

    session
      A database session instance.

    cfHandler
      A CF handler instance

    aggdimName
      The name of the dimension which is split across files, if any.

    varlocate
      List with elements [varname, pattern]. The variable will be extracted from the file only if the filename
      matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']]

    context
      A dictionary with keys project, model, experiment, and run.

    """

    fileVersion = fileobj.versions[-1]

    # Get the aggregate dimension range
    if aggdimName is not None and openfile.hasVariable(aggdimName):
        aggvarFirst = openfile.getVariable(aggdimName, index=0)
        aggvarLast = openfile.getVariable(aggdimName, index=-1)
        aggvarLen = openfile.inquireVariableShape(aggdimName)[0]
        aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName))
        if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"):
            if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12:
                dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE)

    if aggdimName is not None and not openfile.hasVariable(aggdimName):
        info("Aggregate dimension not found: %s"%aggdimName)

    varlocatedict = {}
    if varlocate is not None:
        for varname, pattern in varlocate:
            varlocatedict[varname] = pattern

    # For each variable in the file:
    for varname in openfile.inquireVariableList():
        varshape = openfile.inquireVariableShape(varname)
        debug("%s%s"%(varname, `varshape`))

        # Check varlocate
        if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname], os.path.basename(fileVersion.location)):
            debug("Skipping variable %s in %s"%(varname, fileVersion.location))
            continue

        # Create a file variable
        filevar = FileVariable(varname, openfile.getAttribute('long_name', varname, None))
        fileobj.file_variables.append(filevar)

        # Create attributes:
        for attname in openfile.inquireAttributeList(varname):
            attvalue = openfile.getAttribute(attname, varname)
            atttype, attlen = getTypeAndLen(attvalue)
            attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen)
            filevar.attributes.append(attribute)
            debug('  %s.%s = %s'%(varname, attname, `attvalue`))

        # Create dimensions
        seq = 0
        dimensionList = openfile.inquireVariableDimensions(varname)
        for dimname, dimlen in zip(dimensionList, varshape):
            dimension = FileVariableDimension(dimname, dimlen, seq)
            filevar.dimensions.append(dimension)
            if dimname==aggdimName:
                filevar.aggdim_first = float(aggvarFirst)
                filevar.aggdim_last = float(aggvarLast)
                filevar.aggdim_units = aggvarunits
            seq += 1

        # Set coordinate axis range and type if applicable
        if len(varshape)==1:
            var0 = openfile.getVariable(varname, index=0)
            varn = openfile.getVariable(varname, index=-1)
            if cfHandler.axisIsLatitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Y'
            elif cfHandler.axisIsLongitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'X'
            elif cfHandler.axisIsLevel(filevar):
                vararray = openfile.getVariable(varname)
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Z'
                filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above

    # Create global attribute
    for attname in openfile.inquireAttributeList():
        attvalue = openfile.getAttribute(attname, None)
        atttype, attlen = getTypeAndLen(attvalue)
        attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen)
        fileobj.attributes.append(attribute)
        if attname=='tracking_id':
            fileVersion.tracking_id = attvalue
        debug('.%s = %s'%(attname, attvalue))
Example #21
    def start_harvest(self, parent):
        from esgcet.publish import publishDatasetList
        from esgcet.model import Dataset, PUBLISH_FAILED_EVENT, ERROR_LEVEL

        dcolor1 = Pmw.Color.changebrightness(self.parent.parent, 'aliceblue',
                                             0.8)

        # Make sure the publisher is logged in
        # if not self.parent.parent.password_flg:
        #    self.parent.parent.menu.login_menu.evt_login( self.parent.parent )

        # Start the busy routine to indicate to the users something is happening
        self.parent.parent.busyCursor = 'watch'
        self.parent.parent.busyWidgets = [
            self.parent.parent.pane2.pane('EditPaneTop'),
            self.parent.parent.pane2.pane('EditPaneBottom'),
            self.parent.parent.pane2.pane('EditPaneStatus'),
            self.parent.parent.pane.pane('ControlPane')
        ]
        pub_busy.busyStart(self.parent.parent)
        try:
            # Generate the list of datasets to be published
            datasetNames = []
            GUI_line = {}
            tab_name = self.parent.parent.top_notebook.getcurselection()
            selected_page = self.parent.parent.main_frame.selected_top_page

            if (selected_page is None):
                warning(
                    "Must generate a list of datasets to scan before publishing can occur."
                )
                pub_busy.busyEnd(self.parent.parent)
                return

            for x in self.parent.parent.main_frame.top_page_id[selected_page]:

                if self.parent.parent.main_frame.top_page_id[selected_page][x].cget(
                        'bg'
                ) != 'salmon' and self.parent.parent.main_frame.top_page_id2[
                        selected_page][x].cget('bg') != 'salmon':
                    dset_name = self.parent.parent.main_frame.top_page_id2[
                        selected_page][x].cget('text')
                    #######################################
                    # ganz added this 1/18/11
                    versionNum = self.parent.parent.main_frame.version_label[
                        selected_page][x].cget('text')
                    dsetTuple = (dset_name, versionNum)
                    #dsetName = generateDatasetVersionId(dsetTuple)
                    #####################################################################################
                    # dsetTuple = parseDatasetVersionId(dset_name) # ganz no longer necessary
                    datasetNames.append(dsetTuple)
                    GUI_line[dset_name] = x
                else:
                    if self.parent.parent.main_frame.top_page_id2[
                            selected_page][x].cget('bg') == 'salmon':
                        self.parent.parent.main_frame.top_page_id[
                            selected_page][x].configure(relief='raised',
                                                        background='salmon',
                                                        image=self.off)

        # Publish collection of datasets
            testProgress = (self.parent.parent.statusbar.show, 0, 100)
            publishThredds = (quality_control_widgets.get_CheckBox3() == 1)
            publishGateway = (quality_control_widgets.get_CheckBox2() == 1)
            if (publishThredds):
                print 'publishing to Thredds'
            if (publishGateway):
                print 'publishing to Gateway'

            status_dict = publishDatasetList(datasetNames,
                                             self.Session,
                                             publish=publishGateway,
                                             thredds=publishThredds,
                                             progressCallback=testProgress)

            # Show the published status
            for x in status_dict.keys():
                status = status_dict[x]
                dsetName, versionNo = x
                dsetVersionName = generateDatasetVersionId(x)
                guiLine = GUI_line[dsetName]  # dsetVersionName]

                self.parent.parent.main_frame.status_label[selected_page][
                    guiLine].configure(
                        text=pub_controls.return_status_text(status))
                dset = Dataset.lookup(dsetName, self.Session)
                if dset.has_warnings(self.Session):
                    warningLevel = dset.get_max_warning_level(self.Session)
                    if warningLevel >= ERROR_LEVEL:
                        buttonColor = "pink"
                        buttonText = "Error"
                    else:
                        buttonColor = "yellow"
                        buttonText = "Warning"
                    self.parent.parent.main_frame.ok_err[selected_page][
                        guiLine].configure(
                            text=buttonText,
                            bg=buttonColor,
                            relief='raised',
                            command=pub_controls.Command(
                                self.parent.parent.pub_buttonexpansion.
                                extraction_widgets.error_extraction_button,
                                dset))
                else:
                    self.parent.parent.main_frame.ok_err[selected_page][
                        guiLine].configure(
                            text='Ok',
                            bg=dcolor1,
                            highlightcolor=dcolor1,
                            relief='sunken',
                        )
        except:
            pub_busy.busyEnd(
                self.parent.parent
            )  # catch here in order to turn off the busy cursor ganz
            raise
        finally:
            pub_busy.busyEnd(self.parent.parent)
            self.my_refresh()
Example #22
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline',  'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag=='-a':
            aggregateDimension = arg
        elif flag=='--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag=='--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag=='--echo-sql':
            echoSql = True
        elif flag=='--experiment':
            initcontext['experiment'] = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--keep-version':
            keepVersion = True
        elif flag=='--log':
            log_filename = arg
        elif flag=='--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag=='--model':
            initcontext['model'] = arg
        elif flag=='--nodbwrite':
            nodbwrite = True
        elif flag=='--new-version':
            try:
                version = string.atoi(arg)
                if version <=0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s"%arg)
        elif flag=='--no-thredds-reinit':
            threddsReinit = False
        elif flag=='--noscan':
            publishOnly = True
        elif flag=='--offline':
            offline = True
        elif flag=='--parent':
            parent = arg
        elif flag=='--per-time':
            perVariable = False
        elif flag=='--per-variable':
            perVariable = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag=='--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag=='--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag=='--rest-api':
            restApi = True
        elif flag=='--service':
            service = arg
        elif flag=='--set-replica':
            masterGateway = 'DEFAULT'
        elif flag=='--summarize-errors':
            summarizeErrors = True
        elif flag=='--thredds':
            thredds = True
        elif flag=='--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag=='--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag=='--use-list':
            rescan = True
            if arg=='-':
                namelist=sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line[0]!='#':
                    rescanDatasetName.append(line)
        elif flag=='--validate':
            schema = arg
            restApi = True
        elif flag=='--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs)==0:
                print "No directories specified."
                return

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item,-1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s"%projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s "%listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName,-1)):
                    dmap[(dsetName,-1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName,-1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames)==0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")
    
    # Iterate over datasets
    if not publishOnly:

#        pdb.set_trace()

        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite)


    if (not nodbwrite):
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema)
    # print `result`

    if summarizeErrors:
        print 'Summary of errors:'
        for name,versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ==='%dset.name
                for line in dset.get_warnings(Session):
                    print line
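
The --version-list file parsed above contains one "dataset_id | version" pair per line. A minimal sketch of a reader for that format (the file name is illustrative):

version = {}
with open('version_list.txt') as f:  # hypothetical file
    for line in f:
        line = line.strip()
        if not line:
            continue
        dsid, vers = line.split('|')
        version[dsid.strip()] = int(vers.strip())
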
Example #23
    def generateDirectoryMap(self,
                             directoryList,
                             filefilt,
                             initContext=None,
                             datasetName=None,
                             use_version=False):
        """Generate a directory map. Recursively scan each directory in *directoryList*,
        locating each directory with at least one file matching filefilt.

        Returns a directory map (dictionary) mapping
        dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]
        where the dataset_id is generated by matching the 'directory_format' configuration option to
        each directory path. The map has one entry per directory, where it is assumed that
        all files in the directory belong to the same dataset.

        directoryList
          List of directories to scan. The scan searches for directories matching the 'directory_format'
          configuration file option for this project, and having at least one file matching *filefilt*.

        filefilt
          Regular expression as defined by the Python **re** module. Matched against the file basename.

        initContext
          Dictionary of field => value items. Entries override values found from matching the directory paths.

        datasetName
          Name of the dataset. If not specified, generate with ``generateDatasetId()``.
        """
        from esgcet.publish import nodeIterator

        # If the dataset name is specified, no need to get directory format filters

        if datasetName is None:
            # Get the dataset_id and filters
            filters = self.getFilters()
            config = getConfig()
            section = 'project:' + self.name
            dataset_id_formats = splitLine(
                config.get(section, 'dataset_id', raw=True))
            idfields = [
                re.findall(_patpat, format) for format in dataset_id_formats
            ]
        else:
            filters = [r'.*$']

        # Iterate over nodes
        mapdict = self.getMaps()
        datasetMap = {}
        for direc in directoryList:
            if direc[-1] == '/':
                direc = direc[:-1]
            nodeiter = nodeIterator(direc, filters, filefilt)
            for nodepath, filepath, groupdict in nodeiter:
                if initContext is not None:
                    groupdict.update(initContext)
                if not groupdict.has_key('project'):
                    groupdict['project'] = self.name
                if datasetName is None:
                    try:
                        datasetId = self.generateDatasetId(
                            'dataset_id',
                            idfields,
                            groupdict,
                            multiformat=dataset_id_formats)
                        if use_version and 'version' in groupdict:
                            drsversion = groupdict['version']
                            if not re.match('^[0-9]+$',
                                            drsversion[0]):  # e.g. vYYYYMMDD
                                drsversion = drsversion[1:]
                            datasetId += '#%s' % drsversion
                    except:
                        allfields = reduce(lambda x, y: set(x) | set(y),
                                           idfields)
                        missingFields = list((set(allfields) -
                                              set(groupdict.keys())) -
                                             set(config.options(section)))
                        raise ESGPublishError(
                            "Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"
                            % (missingFields, nodepath))
                else:
                    warning(
                        "Empty dataset name.  Check that directory hierarchy format matches the configured format string in esg.ini"
                    )
                    datasetId = datasetName
                if datasetMap.has_key(datasetId):
                    datasetMap[datasetId].append((nodepath, filepath))
                else:
                    datasetMap[datasetId] = [(nodepath, filepath)]

        if (len(datasetMap) == 0):
            warning(
                "Empty datasetMap.  Check that directory hierarchy format matches the configured format string in esg.ini"
            )
        return datasetMap
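
For reference, the directory map returned by generateDirectoryMap maps each dataset identifier to the directories (and one file per entry) that belong to it. An illustrative entry with made-up paths:

datasetMap = {
    'cmip5.output1.MODEL.historical.mon.atmos.Amon.r1i1p1': [
        ('/data/cmip5/output1/MODEL/historical/mon/atmos/Amon/r1i1p1/tas',
         '/data/cmip5/output1/MODEL/historical/mon/atmos/Amon/r1i1p1/tas/tas_Amon_MODEL_historical_r1i1p1_200001-200512.nc'),
    ],
}
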
Example #24
    def new_query_page(self, parent, tab_name=None, query_id=None):
        # Start the busy routine to indicate to the users something is happening

        self.parent.parent.busyCursor = "watch"
        self.parent.parent.busyWidgets = [
            self.parent.parent.pane2.pane("EditPaneTop"),
            self.parent.parent.pane2.pane("EditPaneBottom"),
            self.parent.parent.pane2.pane("EditPaneStatus"),
            self.parent.parent.pane.pane("ControlPane"),
        ]
        pub_busy.busyStart(self.parent.parent)

        try:
            properties = {}
            projectName = self.parent.query_fields["project"].get()  # Must have projectName
            handler = getHandlerByName(projectName, None, self.Session)
            tabcolor = Pmw.Color.changebrightness(self.parent.parent, pub_controls.query_tab_color, 0.6)

            # works up to here

            if query_id is None:
                for x in self.parent.query_fields.keys():
                    query_string = self.parent.query_fields[x].get().lstrip()
                    if (query_string == "-Any-") or (len(query_string) == 0):
                        properties[x] = (2, "%")
                    elif query_string != "-Any-":
                        properties[x] = (1, query_string)

                if properties["id"] == (2, "%"):
                    del properties["id"]  # This causes an error because you cannot modify the 'id'

                listProperties = False

                result, headers = queryDatasets(projectName, handler, self.Session, properties)
                # works up to here

                # running this causes it to fail!
                self.new_page(
                    parent,
                    tabName=None,
                    tab_color=tabcolor,
                    page_type="query",
                    query_result=result,
                    list_fields=headers,
                )

            else:
                result, headers = queryDatasets(projectName, handler, self.Session, properties)
                query_id_found = False
                for x in result:
                    if query_id == x[0][:-1]:
                        self.new_page(
                            parent,
                            tabName=None,
                            tab_color=tabcolor,
                            page_type="query",
                            query_result=[x],
                            list_fields=headers,
                        )
                        query_id_found = True
                        break
                if query_id_found is False:
                    warning("The specified dataset id '%s' was not found." % query_id)

            # fails here

            # Enable the "Data Publication" button
            self.parent.ControlButton3.configure(state="normal")

            datasetNames = []
            for x in result:
                datasetNames.append(x[1])
            dmap, offline_map, extraFields = queryDatasetMap(datasetNames, self.Session, extra_fields=True)
            # Check if offline or not, then set the iteration values for each page

            selected_page = self.parent.parent.main_frame.selected_top_page
            self.parent.parent.hold_offline[selected_page] = offline_map
            self.parent.parent.main_frame.projectName[selected_page] = projectName
            self.parent.parent.main_frame.dmap[selected_page] = dmap
            self.parent.parent.main_frame.extraFields[selected_page] = extraFields
            self.parent.parent.main_frame.datasetMapfile[selected_page] = None
            self.parent.parent.directoryMap[selected_page] = None
            self.parent.parent.main_frame.dirp_firstfile[selected_page] = None
            self.parent.parent.defaultGlobalValues[selected_page] = {}

        finally:
            pub_busy.busyEnd(self.parent.parent)  # always turn off the busy cursor, even on error (ganz)
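# A minimal sketch (separate from the GUI method above) of the (flag, value) property
# encoding that queryDatasets expects: (2, "%") acts as a match-anything wildcard and
# (1, value) requests a literal match. The helper name and field values are hypothetical.
def build_query_properties(query_fields):
    """Translate raw query-field strings into the (flag, value) form used above."""
    properties = {}
    for name, raw_value in query_fields.items():
        value = raw_value.lstrip()
        if value in ("", "-Any-"):
            properties[name] = (2, "%")       # wildcard: match any value
        else:
            properties[name] = (1, value)     # literal match
    # The 'id' field cannot be used as a wildcard filter, mirroring the deletion above
    if properties.get("id") == (2, "%"):
        del properties["id"]
    return properties

# Example: build_query_properties({"project": "cmip5", "experiment": "-Any-", "id": ""})
# returns {"project": (1, "cmip5"), "experiment": (2, "%")}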
Ejemplo n.º 28
0
    def return_content2(self, appendOpt=False):
        import types
        from esgcet.publish import iterateOverDatasets, processIterator
        from esgcet.config import getHandlerByName, loadConfig
        from esgcet.model import eventName

        # Initialize parameters for iterating over datasets
        initcontext = {}
        aggregateOnly = False
        properties = {}
        publish = False
        publishOnly = False
        thredds = False
        testProgress1 = [self.parent.parent.statusbar.show, 0, 50]
        testProgress2 = [self.parent.parent.statusbar.show, 50, 100]
        handlerDictionary = {}

        # Get the currently selected tab and the selected datasets
        tab_name = self.parent.parent.top_notebook.getcurselection()
        selected_page = self.parent.parent.main_frame.selected_top_page
        datasetNames = []
       # datasetNames2 = []
        if (selected_page is None):
           warning("Must generate a list of datasets to scan before data extraction can occur.")
           return

        if (selected_page is not None) or (self.parent.parent.hold_offline[selected_page] == True):
           extraFields = None 
           if (self.parent.parent.hold_offline[selected_page] == False) or (isinstance(self.parent.parent.hold_offline[selected_page], types.DictType)):
              for x in self.parent.parent.main_frame.top_page_id[selected_page]:
                dsetVersionName = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text') # GANZ TODO version_label
                
                   # ganz added this 1/21/11
                if (self.parent.parent.main_frame.version_label[selected_page] ):
                    dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')               
                    dsetVersion = self.parent.parent.main_frame.version_label[selected_page][x].cget('text')                 
                  #####################################################################################               
                else:
                    dset_name, dsetVersion = parseDatasetVersionId(dsetVersionName)

                # Retrieve all the datasets in the collection for display
                """ ganz test code
                status = pollDatasetPublicationStatus(dset_name, self.Session)
                status_text = pub_controls.return_status_text( status )
                if status_text != 'Error':
                   dsetTuple = parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text'))
                   datasetNames2.append(dsetTuple)
                """
                # Retrieve only the datasets that have been selected
                if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon':
                   dsetTuple =  parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text'))
                   datasetNames.append(dsetTuple)

              dmap = self.parent.parent.main_frame.dmap[selected_page]
              extraFields = self.parent.parent.main_frame.extraFields[selected_page]
              datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
              projectName = self.parent.parent.main_frame.projectName[selected_page]
              directoryMap = self.parent.parent.directoryMap[selected_page]

              if dmap is not None:
                 for x in datasetNames:
                    dsetId = x[0] 
                    datasetName = x
                    try:
                        dmapentry = dmap[datasetName]
                    except:

                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        trykey = (datasetName[0], -1)
                        dmapentry = dmap[trykey]
                        del dmap[trykey]
                        dmap[datasetName] = dmapentry
                    firstFile = dmapentry[0][0]
  
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]
                 # Copy the defaultGlobalValues into initcontext
                 initcontext = self.parent.parent.main_frame.defaultGlobalValues[selected_page]
              else:
                  # more test code
                 myholdDirectoryMap = self.parent.parent.directoryMap[selected_page] 
                 #mydatasetNames = [(item,-1) for item in myholdDirectoryMap.keys()]
                 mydatasetNames = [(item) for item in myholdDirectoryMap.keys()]
                 #end
                 for x in mydatasetNames:
                    dsetId = x[0] 
                    datasetName = x
                    # ganz this is test code
                    try:
                        dmapentry = myholdDirectoryMap[datasetName]
                    except:

                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        
                        trykey = (datasetName[0], -1)
                        dmapentry = myholdDirectoryMap[trykey]
                        del myholdDirectoryMap[trykey]
                        myholdDirectoryMap[datasetName] = dmapentry
                        
                    firstFile = dmapentry[0][1]
                    #end of test code
                    
                    #firstFile = self.parent.parent.main_frame.dirp_firstfile[selected_page]
 
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]
           else:      # working off-line
              projectName = self.parent.parent.main_frame.projectName[selected_page]
              if self.parent.parent.offline_file_directory[selected_page] == "directory":
                 if self.parent.parent.config is None:
                    extraction_controls.call_sessionmaker( self.parent.parent )
                 datasetPaths = []
                 dmap = {self.parent.parent.offline_datasetName : datasetPaths}
                 listerSection = getOfflineLister(self.parent.parent.config, "project:%s"%projectName, None)
                 offlineLister = self.parent.parent.config.get(listerSection, 'offline_lister_executable')
                 lastargs = self.parent.parent.offline_directories
                 commandArgs = "--config-section %s "%listerSection
                 commandArgs += " ".join(lastargs)
                 for filepath, size in processIterator(offlineLister, commandArgs, filefilt=self.parent.parent.filefilt):
                   datasetPaths.append((filepath, str(size)))
                 datasetNames = self.parent.parent.datasetNames
                 directoryMap = None

                 # get the handler
                 for x in datasetNames:
                    dsetId = x[0] 
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)

              elif self.parent.parent.offline_file_directory[selected_page] == "file":
                 dmap = self.parent.parent.main_frame.dmap[selected_page]
                 extraFields = self.parent.parent.main_frame.extraFields[selected_page]
                 datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
                 projectName = self.parent.parent.main_frame.projectName[selected_page]
                 directoryMap = None
                 if datasetMapfile is not None:
                     dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
                     datasetNames = dmap.keys()

                 # get the handlers
                 for x in datasetNames:
                    dsetId = x[0] 
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)


           # Iterate over datasets
           if appendOpt:
               operation = UPDATE_OP
           else:
               operation = CREATE_OP
        
           datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, self.Session, self.parent.parent.aggregateDimension, operation, self.parent.parent.filefilt, initcontext, self.parent.parent.hold_offline[selected_page], properties, comment=self.comments, testProgress1=testProgress1, testProgress2=testProgress2 , handlerDictionary=self.parent.parent.handlerDictionary, extraFields=extraFields, readFiles=True)

           # If working on-line then replace the scanned list of datasets with 
           # the complete list of datasets
           #test
           """
           print 'datasetNames:'
           for t1 in datasetNames:
               print t1
           print 'datasetNames2:'    
           for t2 in datasetNames2:
               print t2
           """   
           if not self.parent.parent.hold_offline[selected_page]:
              datasets = []
              versionObjs = []
              # ganz finally, tested datasetNames2 here
              for dsetName, version in datasetNames:
                  result = Dataset.lookup(dsetName, self.Session, version=version)
                  if result is not None:
                      entry, versionObj = result
                      datasets.append(entry)
                      versionObjs.append(versionObj)

           # Get the summary of errors after doing a data extraction
           dset_error = []
           for dset in datasets:
               status = dset.get_publication_status(self.Session)
               status_name = eventName[status]
               if dset.has_warnings(self.Session):
                   dset_error.append(dset.get_name(self.Session))

           try:
              list_fields = getQueryFields( handler )
           except:
              handler = getHandlerByName(projectName, None, self.Session)
              list_fields = getQueryFields( handler )

           # Display the datasets in the "Collection" page
#           if self.parent.parent.hold_offline[selected_page] == True:
#              tab_name = "Collection_Offline"
#              from_tab = "Collection"
#              pub_editorviewer = self.parent.parent.create_publisher_editor_viewer( self.parent.parent, tab_name, dataset, from_tab, self.Session)

           # Show the extracted datasets
           self.set_column_labels( len(datasets), list_fields )
           self.show_extracted_info(datasets, dset_error, list_fields, versionObjs)

        # Enable the "Data Publication" button
        self.parent.ControlButton3.configure( state = 'normal' )
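# A small helper sketch capturing the dataset-map key migration that return_content2
# performs inline (twice) above: entries keyed by (dataset_name, -1) are re-keyed to
# (dataset_name, version) the first time the versioned key is requested. The helper
# name is hypothetical; the logic mirrors the try/except blocks above.
def lookup_dmap_entry(dmap, dataset_key):
    """Return the dmap entry for (name, version), migrating a (name, -1) key if needed."""
    try:
        return dmap[dataset_key]
    except KeyError:
        old_key = (dataset_key[0], -1)
        entry = dmap[old_key]          # raises KeyError if neither key exists
        del dmap[old_key]
        dmap[dataset_key] = entry      # re-key under the versioned name
        return entry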
Ejemplo n.º 29
0
def getProduct(cmor_table, variable, experiment, year1, year2):
    """Get the DRS product value associated with the file.
    Returns
      'output1' for datasets to be replicated,
      'output2' for datasets outside the replicated datasets,
      'output' if the product cannot be determined.
    """
    cmor_table = cmor_table.lower()
    variable = variable.lower()

    # decadal1960, decadal1980, decadal2005 => decadal_30
    # Other decadal experiments => decadal_10

    if experiment is None:
        if WARN:
            warning("Found empty experiment field")
        base_year = None
    else:
        if experiment[0:7]=='decadal':
            fullexperiment = experiment
            if experiment in ['decadal1960', 'decadal1980', 'decadal2005']:
                experiment = 'decadal_30'
            else:
                experiment = 'decadal_10'
            try:
                base_year = int(fullexperiment[7:11])
            except ValueError:
                base_year = 0
        else:
            base_year = None

    # If the variable is not in the request list, => output2
    vardict = cmor_variables.get(cmor_table, None)
    reqdict = requested_time.get(cmor_table, None)

    # If the CMOR table or variable is unknown, don't even try
    if vardict is None or variable is None:
        result = 'output'

    # Check for variables outside the request list
    elif variable not in vardict:
        result = 'output2'

    # CMOR table == 'day'
    elif cmor_table == 'day':
        if variable in ['huss', 'omldamax', 'pr', 'psl', 'sfcwind', 'tas', 'tasmax', 'tasmin', 'tos', 'tossq']:
            result = 'output1'
        else:
            result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2)

    # CMOR table == 'Oyr'
    elif cmor_table == 'oyr':
        priority, dimensions = vardict[variable]
        if priority in [1,2]:
            result = 'output1'
        else:
            result = 'output2'

    # CMOR table == 'Omon'
    elif cmor_table == 'omon':
        priority, dimensions = vardict[variable]
        if 'basin' in dimensions:
            result = 'output1'
        elif 'olevel' in dimensions and priority>1:
            result = 'output2'
        else:
            result = 'output1'

    # CMOR table == 'aero'
    elif cmor_table == 'aero':
        priority, dimensions = vardict[variable]
        if 'alevel' not in dimensions:
            result = 'output1'
        else:
            result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2, base_year=base_year)

    # CMOR table == '6hrPlev', '3hr', 'cfMon', 'cfOff'
    elif cmor_table in ['6hrplev', '3hr', 'cfmon', 'cfoff']:
        result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2)

    # Otherwise => output1
    else:
        result = 'output1'

    return result
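# Hedged usage sketch for getProduct. The variable and experiment names are hypothetical,
# and the expected outcomes assume the module-level cmor_variables / requested_time
# tables contain the corresponding entries.
if __name__ == '__main__':
    # 'tas' appears in the explicit 'day' shortlist above, so it maps to replicated output.
    print(getProduct('day', 'tas', 'historical', 1990, 2000))              # expected: 'output1'
    # A variable missing from the request list for its table falls into 'output2'.
    print(getProduct('day', 'some_unlisted_var', 'historical', 1990, 2000))
    # Decadal experiments are remapped: decadal1960/1980/2005 -> 'decadal_30', others -> 'decadal_10'.
    print(getProduct('3hr', 'pr', 'decadal1960', 1961, 1990))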
Ejemplo n.º 30
0
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct): 
        datasetName,versionno = datasetNames[iloop]

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s"%datasetName)
            
        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate=False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName,versionno)])==0:
                warning("No files specified for dataset %s, version %d."%(datasetName,versionno))
                continue
            firstFile = dmap[(datasetName,versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter  = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name
            info("Using project name = %s"%projectName)
        if prevProject is not None and projectName!=prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored'%name)
            else:
                context[name] = value

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset=None
        if testProgress1 is not None:
           testProgress1[1] = (100./ct)*iloop
           if not offline:
              testProgress1[2] = (100./ct)*iloop + (50./ct)
           else:
              testProgress1[2] = (100./ct)*iloop + (100./ct)
        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
           testProgress2[1] = (100./ct)*iloop + 50./ct
           testProgress2[2] = (100./ct)*(iloop + 1)
        if runAggregate:
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)
            
        # Save the context with the dataset, so that it can be searched later
        handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
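# A minimal sketch of calling iterateOverDatasets with a hand-built dataset map. The
# dataset id, file path, and size are hypothetical; Session is an SQLAlchemy session
# factory and CREATE_OP is assumed to be available from esgcet.publish, as in the GUI
# code and esgpublishWrapper elsewhere in this collection.
def scan_one_dataset_sketch(Session):
    dmap = {
        ('cmip5.output1.EXAMPLE.historical.mon.atmos.Amon.r1i1p1', 1): [
            ('/data/example/tas_Amon_example_historical_r1i1p1_185001-200512.nc', '123456789'),
        ],
    }
    datasetNames = list(dmap.keys())           # [(dataset_name, version), ...]
    return iterateOverDatasets('cmip5', dmap, None, datasetNames, Session,
                               'time', CREATE_OP, r'.*\.nc$', {}, False, {})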
Ejemplo n.º 31
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    If republish is False, returns a status dictionary: datasetName => status.
    Otherwise, returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if True (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    """

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if (not DELETE_AT_DATASET_LEVEL) and (dset is not None):
                for versionObj in versionObjs:
                    try:
                        eventName, stateName = deleteGatewayDatasetVersion(versionObj.name, gatewayOperation, service, session, dset=dset)
                    except RemoteCallException, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                        continue
                    except ESGPublishError, e:
                        fields = `e`.split('\n')
                        error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                        continue
                    info("  Result: %s"%stateName)
                    resultDict[datasetName] = eventName
            else:                       # Nothing in the node database, but still try to delete on the gateway
                if DELETE_AT_DATASET_LEVEL and (dset is not None) and (not restInterface):
                    datasetName = dset.name
                try:
                    eventName, stateName = deleteGatewayDatasetVersion(datasetName, gatewayOperation, service, session, dset=dset)
                except RemoteCallException, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n')))
                    continue
                except ESGPublishError, e:
                    fields = `e`.split('\n')
                    error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n')))
                    continue
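# Hedged usage sketch for deleteDatasetList: retract two dataset versions from the
# gateway and drop their THREDDS catalogs while keeping the node database entries.
# The dataset names are hypothetical; Session and UNPUBLISH come from the surrounding
# esgcet environment, as in the function defaults above.
def retract_datasets_sketch(Session):
    names = [
        ('cmip5.output1.EXAMPLE.historical.mon.atmos.Amon.r1i1p1', 1),
        ('cmip5.output1.EXAMPLE.rcp45.mon.atmos.Amon.r1i1p1', 2),
    ]
    return deleteDatasetList(names, Session,
                             gatewayOperation=UNPUBLISH,   # retract rather than delete
                             thredds=True,                 # remove THREDDS catalogs
                             deleteInDatabase=False)       # keep node database records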
Ejemplo n.º 32
0
def esgpublishWrapper(**kw):

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", '.*\.nc$')
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError(
            "Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError(
            "Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file,
                             echoSql=echoSql,
                             log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile,
                                           parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList,
                                                   filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError(
                        "No project found in file %s, specify with --project."
                        % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(
                    directoryList,
                    filefilt,
                    initContext=props,
                    datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(
                    directoryList,
                    filefilt,
                    initContext=props,
                    datasetName=datasetName)

            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName,
                                       None,
                                       Session,
                                       offline=True)
            dmap = {}
            listerSection = getOfflineLister(config,
                                             "project:%s" % projectName,
                                             service)
            offlineLister = config.get(listerSection,
                                       'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                    offlineLister,
                    commandArgs,
                    handler,
                    filefilt=filefilt,
                    datasetName=datasetName,
                    offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName,
                                       dmap,
                                       directoryMap,
                                       datasetNames,
                                       Session,
                                       aggregateDimension,
                                       publishOp,
                                       filefilt,
                                       initcontext,
                                       offline,
                                       properties,
                                       keepVersion=keepVersion,
                                       newVersion=version,
                                       extraFields=extraFields,
                                       masterGateway=masterGateway,
                                       comment=message,
                                       readFiles=readFiles)

    result = publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog)

    return result
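# A hedged driver sketch for esgpublishWrapper: scan the datasets listed in a map file
# and publish them to THREDDS and the gateway. The init file and map path are
# hypothetical; the keyword names match the kw.get(...) calls in the wrapper above.
def publish_from_mapfile_sketch():
    return esgpublishWrapper(
        init_file='/esg/config/esgcet/esg.ini',     # hypothetical esg.ini location
        datasetMapfile='/tmp/example_dataset.map',  # hypothetical dataset map file
        projectName='cmip5',
        aggregateDimension='time',
        thredds=True,
        publish=True,
        message='Initial publication',
    )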
Ejemplo n.º 33
0
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        raise ESGPublishError("Dataset not found: %s"%datasetName)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If a dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there are more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds]:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
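# Hedged usage sketch for aggregateVariables: re-run aggregation for a dataset that has
# already been scanned. The dataset name is hypothetical; Session is an SQLAlchemy
# session factory and handler is a project handler, whose getMetadataHandler() call
# mirrors the one in iterateOverDatasets above.
def reaggregate_sketch(Session, handler):
    cfHandler = handler.getMetadataHandler(sessionMaker=Session)
    return aggregateVariables('cmip5.output1.EXAMPLE.historical.mon.atmos.Amon.r1i1p1',
                              Session,
                              aggregateDimensionName='time',
                              cfHandler=cfHandler)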
Ejemplo n.º 34
0
def extractFromFile(dataset, openfile, fileobj, session, handler, cfHandler, aggdimName=None, varlocate=None, exclude_variables=None, perVariable=None, **context):
    """
    Extract metadata from a file, add to a database.

    dataset
      The dataset instance.

    openfile
      An open netCDF file object.

    fileobj
      A (logical) file instance.

    session
      A database session instance.

    cfHandler
      A CF handler instance

    handler
      Project handler

    aggdimName
      The name of the dimension which is split across files, if any.

    varlocate
      List with elements [varname, pattern]. The variable will be extracted from the file only if the filename
      matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']]

    exclude_variables
        List of thredds_exclude_variables

    perVariable
        Boolean. If True, try to determine a single target_variable; if False, extract all variables.

    context
      A dictionary with keys project, model, experiment, and run.

    """

    fileVersion = fileobj.versions[-1]

    # Get the aggregate dimension range
    if aggdimName is not None and openfile.hasVariable(aggdimName):
        aggvarFirst = openfile.getVariable(aggdimName, index=0)
        aggvarLast = openfile.getVariable(aggdimName, index=-1)
        aggvarLen = openfile.inquireVariableShape(aggdimName)[0]
        aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName))
        if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"):
            if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12:
                dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE)

    if aggdimName is not None and not openfile.hasVariable(aggdimName):
        info("Aggregate dimension not found: %s"%aggdimName)

    varlocatedict = {}
    if varlocate is not None:
        for varname, pattern in varlocate:
            varlocatedict[varname.strip()] = pattern.strip()

    # Create global attribute
    target_variable = None
    for attname in openfile.inquireAttributeList():
        attvalue = openfile.getAttribute(attname, None)
        atttype, attlen = getTypeAndLen(attvalue)
        attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen)
        fileobj.attributes.append(attribute)
        if attname == 'tracking_id':
            fileVersion.tracking_id = attvalue
        # extract target_variable from global attributes
        if attname == 'variable_id' and perVariable:
            target_variable = attvalue
            debug('Extracted target variable from global attributes: %s' % target_variable)
        debug('.%s = %s' % (attname, attvalue))

    # try to get target_variable from DRS if not found in global attributes
    if not target_variable and perVariable:
        config = getConfig()
        if config is not None:
            drs_pattern = handler.getFilters()[0][1:-1]
            drs_file_pattern = '%s/(?P<filename>[\w.-]+)$' % drs_pattern
            drs_parts = re.search(drs_file_pattern, openfile.path).groupdict()
            if 'variable' in drs_parts:
                target_variable = drs_parts['variable']
                debug('Extracted target variable from DRS: %s' % target_variable)

    # target_variable must be present in the file
    if target_variable not in openfile.inquireVariableList():
        target_variable = None

    # For each variable in the file:
    for varname in openfile.inquireVariableList():

        # we need to extract only target, aggregation and coverage variables
        if target_variable:
            is_coverage_variable = check_coverage_variable(varname, openfile)
            if not is_coverage_variable and varname != target_variable and varname != aggdimName:
                debug("Skipping variable %s in %s (not target (%s), coverage or aggregation (%s) variable)" % (varname, fileVersion.location, target_variable, aggdimName))
                continue

        varshape = openfile.inquireVariableShape(varname)
        debug("%s%s"%(varname, `varshape`))

        # Check varlocate
        if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname].strip(), os.path.basename(fileVersion.location)):
            debug("Skipping variable %s in %s"%(varname, fileVersion.location))
            continue

        is_target_variable = True
        if target_variable and target_variable != varname:
            is_target_variable = False
        elif varname in exclude_variables:
            is_target_variable = False

        # Create a file variable
        varstr = openfile.getAttribute('long_name', varname, None)
        
        if not varstr is None and len(varstr) > 255:
            varstr = varstr[0:255]
        filevar = FileVariable(varname, varstr, is_target_variable=is_target_variable)
        fileobj.file_variables.append(filevar)

        # Create attributes:
        for attname in openfile.inquireAttributeList(varname):
            attvalue = openfile.getAttribute(attname, varname)
            atttype, attlen = getTypeAndLen(attvalue)
            attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen)
            filevar.attributes.append(attribute)
            debug('  %s.%s = %s'%(varname, attname, `attvalue`))

        # Create dimensions
        seq = 0
        dimensionList = openfile.inquireVariableDimensions(varname)
        for dimname, dimlen in zip(dimensionList, varshape):
            dimension = FileVariableDimension(dimname, dimlen, seq)
            filevar.dimensions.append(dimension)
            if dimname==aggdimName:
                filevar.aggdim_first = float(aggvarFirst)
                filevar.aggdim_last = float(aggvarLast)
                filevar.aggdim_units = aggvarunits
            seq += 1

        # Set coordinate axis range and type if applicable
        if len(varshape)==1:
            var0 = openfile.getVariable(varname, index=0)
            if var0 is None:
                continue
            varn = openfile.getVariable(varname, index=-1)
            if cfHandler.axisIsLatitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Y'
            elif cfHandler.axisIsLongitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'X'
            elif cfHandler.axisIsLevel(filevar):
                vararray = openfile.getVariable(varname)
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Z'
                filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above
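# Illustrative sketch only (not part of esgcet): the axis handling above asks a
# CF handler whether a 1-D variable is latitude, longitude or a vertical level
# and records its first/last values.  The helper below is a hypothetical,
# simplified stand-in that classifies an axis from its units string; the unit
# lists and the "first:last" range format are assumptions for illustration.
def classify_axis_sketch(units, values):
    """Return (coord_type, coord_range) for a 1-D coordinate, or (None, None)."""
    if len(values) == 0:
        return None, None
    coord_range = "%s:%s" % (values[0], values[-1])
    if units in ("degrees_north", "degree_north", "degrees_N"):
        return 'Y', coord_range
    if units in ("degrees_east", "degree_east", "degrees_E"):
        return 'X', coord_range
    if units in ("Pa", "hPa", "m", "level"):
        return 'Z', coord_range
    return None, coord_range

# classify_axis_sketch("degrees_north", [-90.0, 0.0, 90.0]) -> ('Y', '-90.0:90.0')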
Example No. 35
0
    def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False):
        """Generate a directory map. Recursively scan each directory in *directoryList*,
        locating each directory with at least one file matching filefilt.

        Returns a directory map (dictionary) mapping
        dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]
        where the dataset_id is generated by matching the 'directory_format' configuration option to
        each directory path. The map has one entry per directory, where it is assumed that
        all files in the directory belong to the same dataset.

        directoryList
          List of directories to scan. The scan searches for directories matching the 'directory_format'
          configuration file option for this project, and having at least one file matching *filefilt*.

        filefilt
          Regular expression as defined by the Python **re** module. Matched against the file basename.

        initContext
          Dictionary of field => value items. Entries override values found from matching the directory paths.

        datasetName
          Name of the dataset. If not specified, generate with ``generateDatasetId()``.
        """
        from esgcet.publish import nodeIterator

        # If the dataset name is specified, no need to get directory format filters
        
        if datasetName is None:
            # Get the dataset_id and filters
            filters = self.getFilters()
            config = getConfig()
            section = 'project:'+self.name
            dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
            idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
        else:
            filters = [r'.*$']

        # Iterate over nodes
        mapdict = self.getMaps()
        datasetMap = {}
        for direc in directoryList:
            if direc[-1]=='/':
                direc = direc[:-1]
            nodeiter = nodeIterator(direc, filters, filefilt)
            for nodepath, filepath, groupdict in nodeiter:
                if initContext is not None:
                    groupdict.update(initContext)
                if not groupdict.has_key('project'):
                    groupdict['project'] = self.name
                if datasetName is None:
                    try:
                        datasetId = self.generateDatasetId('dataset_id', idfields, groupdict, multiformat=dataset_id_formats)
                        if use_version and 'version' in groupdict:
                            drsversion = groupdict['version']
                            if not re.match('^[0-9]+$', drsversion[0]): # e.g. vYYYYMMDD
                                drsversion = drsversion[1:]
                            datasetId += '#%s'%drsversion
                    except:
                        allfields = reduce(lambda x,y: set(x) | set(y), idfields)
                        missingFields = list((set(allfields)-set(groupdict.keys()))-set(config.options(section)))
                        raise ESGPublishError("Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"%(`missingFields`, nodepath))
                else:
                    warning("Empty dataset name.  Check that directory hierarchy format matches the configured format string in esg.ini")
                    datasetId = datasetName
                if datasetMap.has_key(datasetId):
                    datasetMap[datasetId].append((nodepath, filepath))
                else:
                    datasetMap[datasetId] = [(nodepath, filepath)]

        if (len(datasetMap) == 0 ):
            warning("Empty datasetMap.  Check that directory hierarchy format matches the configured format string in esg.ini")
        return datasetMap
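# Illustrative usage sketch: generateDirectoryMap() returns a plain dictionary
# mapping each dataset_id (optionally suffixed with "#version") to a list of
# (directory, file) tuples.  The dataset id and paths below are made up.
def walk_directory_map_sketch(dataset_map):
    for dataset_id, entries in sorted(dataset_map.items()):
        for directory_path, file_path in entries:
            print("%s | %s" % (dataset_id, file_path))

walk_directory_map_sketch({
    'cmip5.output1.MYCENTER.MYMODEL.historical.mon.atmos.Amon.r1i1p1#20120101': [
        ('/data/MYMODEL/historical/r1i1p1/v20120101/tas',
         '/data/MYMODEL/historical/r1i1p1/v20120101/tas/tas_Amon_MYMODEL_historical_r1i1p1_185001-200512.nc'),
    ],
})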
Example No. 36
0
def deleteDatasetList(datasetNames,
                      Session,
                      gatewayOperation=UNPUBLISH,
                      thredds=True,
                      las=False,
                      deleteInDatabase=False,
                      progressCallback=None,
                      deleteAll=False,
                      republish=False,
                      restInterface=False):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    If republish is False:
      Returns a status dictionary: datasetName => status
    Otherwise:
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    """

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d" %
                              gatewayOperation)
    deleteOnGateway = (gatewayOperation == DELETE)
    operation = (gatewayOperation != NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName, version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(
            datasetName,
            version,
            session,
            deleteAll=deleteAll,
            restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s" % datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL()
            servicePort = config.getint('DEFAULT', 'hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT',
                                             'hessian_service_debug')
            service = Hessian(serviceURL,
                              servicePort,
                              key_file=serviceKeyfile,
                              cert_file=serviceCertfile,
                              debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL()
            serviceDebug = config.getboolean('DEFAULT',
                                             'rest_service_debug',
                                             default=False)
            service = RestPublicationService(serviceURL,
                                             serviceCertfile,
                                             keyFile=serviceKeyfile,
                                             debug=serviceDebug)

        for datasetName, version in datasetNames:
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            if (not DELETE_AT_DATASET_LEVEL) and (dset is not None):
                for versionObj in versionObjs:
                    try:
                        eventName, stateName = deleteGatewayDatasetVersion(
                            versionObj.name,
                            gatewayOperation,
                            service,
                            session,
                            dset=dset)
                    except RemoteCallException, e:
                        fields = repr(e).split('\n')
                        error(
                            "Deletion/retraction failed for dataset/version %s with message: %s"
                            % (datasetName, string.join(fields[0:2], '\n')))
                        continue
                    except ESGPublishError, e:
                        fields = repr(e).split('\n')
                        error(
                            "Deletion/retraction failed for dataset/version %s with message: %s"
                            % (datasetName, string.join(fields[-2:], '\n')))
                        continue
                    info("  Result: %s" % stateName)
                    resultDict[datasetName] = eventName
            else:  # Nothing in the node database, but still try to delete on the gateway
                if DELETE_AT_DATASET_LEVEL and (dset is not None) and (
                        not restInterface):
                    datasetName = dset.name
                try:
                    eventName, stateName = deleteGatewayDatasetVersion(
                        datasetName,
                        gatewayOperation,
                        service,
                        session,
                        dset=dset)
                except RemoteCallException, e:
                    fields = repr(e).split('\n')
                    error(
                        "Deletion/retraction failed for dataset/version %s with message: %s"
                        % (datasetName, string.join(fields[0:2], '\n')))
                    continue
                except ESGPublishError, e:
                    fields = repr(e).split('\n')
                    error(
                        "Deletion/retraction failed for dataset/version %s with message: %s"
                        % (datasetName, string.join(fields[-2:], '\n')))
                    continue
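# Hypothetical invocation sketch, left commented out because it needs a
# configured node database, THREDDS server and gateway.  A version of -1 is
# used by the GUI code elsewhere in this module set to mean "all versions".
#
# statusDict = deleteDatasetList(
#     [('cmip5.output1.MYCENTER.MYMODEL.historical.mon.atmos.Amon.r1i1p1', 1),
#      ('cmip5.output1.MYCENTER.MYMODEL.historical.mon.atmos.Amon.r2i1p1', -1)],
#     Session,
#     gatewayOperation=UNPUBLISH,
#     thredds=True,
#     deleteInDatabase=False)
# for name, event in statusDict.items():
#     print("%s: %s" % (name, event))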
Example No. 37
0
def checkAndUpdateRepo(cmor_table_path, ds_version):
    """
        Checks for a timestamp file at a predefined location.  If it is missing or too old, pulls the repo at the given path and updates the timestamp.
    """
    # This is run during handler initialization and not for each file validation

    # Pull repo if fetched more than one day ago
    # or if never fetched before
    if os.path.exists(UPDATE_TIMESTAMP):
        mtime = os.path.getmtime(UPDATE_TIMESTAMP)
        now = time()
        if now - mtime > (86400.0):
            pull_cmor_repo = True
        else:
            pull_cmor_repo = False
    else:
        pull_cmor_repo = True

    if pull_cmor_repo:
        try:
            # Go into CMOR table path
            # Git fetch CMOR table repo
            # Go back to previous working directory
            checkedRun(('cd {} && git fetch --quiet'
                        ).format(cmor_table_path))
            # Update local timestamp
            f = open(UPDATE_TIMESTAMP, "w")
            f.write("CMOR table updated at {}".format(time()))
            f.close()
            debug("Local CMOR table repository fetched or updated")
        except Exception as e:
            warning("Attempted to update the CMOR table repo and encountered an error: " + str(e))

    # Change repo branch in any case
    try:
        # Go into CMOR table path
        # Stash any changes from previous checkout 
        # Checkout to the appropriate CMOR table tag
        # Go back to previous working directory
        checkedRun(('cd {} && git stash --quiet && git checkout {} --quiet'
                    ).format(cmor_table_path, ds_version))
        # Update local timestamp
        f = open(UPDATE_TIMESTAMP, "w")
        f.write("CMOR table updated at {}".format(time()))
        f.close()
        debug("Consider CMOR table tag: {}".format(ds_version))
    except Exception as e:
        raise ESGPublishError("Error data_specs_version tag %s not found in the CMOR tables or other error.  Please contact support"%ds_version)

    # Get most up to date CMIP6_CV in any case
    if ds_version != "master":
        try:
            # Go into CMOR table path
            # PrePARE requires to have the most up to date CMIP6 CV.
            # Update CMIP6_CV.json from master branch.
            # Go back to previous working directory
            checkedRun(('cd {} && git checkout master CMIP6_CV.json --quiet'
                        ).format(cmor_table_path))
            debug("CMIP6 CV updated from master")
        except Exception as e:
            raise ESGPublishError("Master branch does not exists or CMIP6_CV.json not found or other error.  Please contact support" % ds_version)
Example No. 38
0
def getProduct(cmor_table, variable, experiment, year1, year2):
    """Get the DRS product value associated with the file.
    Returns
      'output1' for datasets to be replicated,
      'output2' for datasets outside the replicated datasets,
      'output' if the product cannot be determined.
    """
    cmor_table = cmor_table.lower()
    variable = variable.lower()

    # decadal1960, decadal1980, decadal2005 => decadal_30
    # Other decadal experiments => decadal_10

    if experiment is None:
        if WARN:
            warning("Found empty experiment field")
        base_year = None
    else:
        if experiment[0:7] == 'decadal':
            fullexperiment = experiment
            if experiment in ['decadal1960', 'decadal1980', 'decadal2005']:
                experiment = 'decadal_30'
            else:
                experiment = 'decadal_10'
            try:
                base_year = int(fullexperiment[7:11])
            except:
                base_year = 0
        else:
            base_year = None

    # If the variable is not in the request list, => output2
    vardict = cmor_variables.get(cmor_table, None)
    reqdict = requested_time.get(cmor_table, None)

    # If the CMOR table or variable are unknown, don't even try
    if vardict is None or variable is None:
        result = 'output'

    # Check for variables outside the request list
    elif variable not in vardict:
        result = 'output2'

    # CMOR table == 'day'
    elif cmor_table == 'day':
        if variable in [
                'huss', 'omldamax', 'pr', 'psl', 'sfcwind', 'tas', 'tasmax',
                'tasmin', 'tos', 'tossq'
        ]:
            result = 'output1'
        else:
            result = getTimeDependentProduct(cmor_table, variable, experiment,
                                             reqdict, year1, year2)

    # CMOR table == 'Oyr'
    elif cmor_table == 'oyr':
        priority, dimensions = vardict[variable]
        if priority in [1, 2]:
            result = 'output1'
        else:
            result = 'output2'

    # CMOR table == 'Omon'
    elif cmor_table == 'omon':
        priority, dimensions = vardict[variable]
        if 'basin' in dimensions:
            result = 'output1'
        elif 'olevel' in dimensions and priority > 1:
            result = 'output2'
        else:
            result = 'output1'

    # CMOR table == 'aero'
    elif cmor_table == 'aero':
        priority, dimensions = vardict[variable]
        if 'alevel' not in dimensions:
            result = 'output1'
        else:
            result = getTimeDependentProduct(cmor_table,
                                             variable,
                                             experiment,
                                             reqdict,
                                             year1,
                                             year2,
                                             base_year=base_year)

    # CMOR table == '6hrPlev', '3hr', 'cfMon', 'cfOff'
    elif cmor_table in ['6hrplev', '3hr', 'cfmon', 'cfoff']:
        result = getTimeDependentProduct(cmor_table, variable, experiment,
                                         reqdict, year1, year2)

    # Otherwise => output1
    else:
        result = 'output1'

    return result
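# Illustrative calls (experiment/variable names are hypothetical, and the
# module-level cmor_variables/requested_time tables must be populated for the
# real function to behave as sketched):
#
# getProduct('day', 'tas', 'piControl', 500, 549)
#     -> 'output1'  (in the fixed daily list, assuming 'tas' is a requested variable)
# getProduct('day', 'someothervar', 'piControl', 500, 549)
#     -> 'output2'  (known table, but variable not in the request list)
# getProduct('unknownTable', 'tas', 'historical', 1850, 2005)
#     -> 'output'   (table not found, so the product cannot be determined)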
Example No. 39
0
    def parseDatasetName(self, datasetName, context):
        """Parse a dataset name.

        Returns a dictionary, mapping field => value. The config file option 'dataset_id'
        is used to parse the name into fields.

        datasetName
          String dataset identifier.

        context
          Initial context dictionary. This argument is altered on output.

        """
        config = getConfig()
        section = 'project:' + self.name
        datasetIdFormatList = config.get(section,
                                         'dataset_id',
                                         raw=True,
                                         default=None)
        if datasetIdFormatList is None:
            # warning("No dataset_id option found for project %s"%self.name)
            return context
        datasetIdFormats = splitLine(datasetIdFormatList)

        formatMatched = False
        for idFormat in datasetIdFormats:

            # '.' => '\.'
            newinit = re.sub(r'\.', r'\.', idFormat.strip())

            # %(name)s => (?P<name>[^.]*)
            newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit)

            # If experiment is enumerated, match on the experiment options. This allows
            # experiment ids to contain periods (.) .
            experimentOptions = self.getFieldOptions('experiment')

            # Map to case-sensitive options
            experimentOptions = self.mapValidFieldOptions(
                'experiment', experimentOptions)
            if idFormat.find(
                    '%(experiment)s') != -1 and experimentOptions is not None:
                if len(experimentOptions) > 0:
                    optionOr = reduce(lambda x, y: x + '|' + y,
                                      experimentOptions)
                    experimentPattern = r'(?P<experiment>%s)' % optionOr
                    newinit = newinit.replace('(?P<experiment>[^.]*)',
                                              experimentPattern)

            if newinit[-1] != '$':
                newinit += '$'

            match = re.match(newinit, datasetName)

            if match is None:
                continue
            else:
                result = match.groupdict()
                formatMatched = True
            for key, value in result.items():
                if context.has_key(key) and value != context[key]:
                    warning("Dataset ID=%s, but %s=%s" %
                            (datasetName, key, context[key]))
                else:
                    context[str(key)] = value
            break

        if not formatMatched:
            warning(
                "Dataset ID: %s does not match the dataset_id format(s): %s" %
                (datasetName, repr(datasetIdFormats)))

        return context
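# Self-contained sketch of the format-to-regex translation performed above:
# each '%(field)s' token in a dataset_id format becomes a named group matching
# anything except a dot.  The token pattern below is assumed to mirror the
# module-level _patpat, and the format string and dataset id are hypothetical.
import re

_patpat_sketch = r'%\(([^()]+)\)s'

def dataset_id_pattern_sketch(id_format):
    pattern = re.sub(r'\.', r'\.', id_format.strip())            # '.'  => '\.'
    pattern = re.sub(_patpat_sketch, r'(?P<\1>[^.]*)', pattern)  # %(name)s => (?P<name>[^.]*)
    return pattern + '$'

m = re.match(dataset_id_pattern_sketch('%(project)s.%(model)s.%(experiment)s.%(ensemble)s'),
             'cmip5.MYMODEL.historical.r1i1p1')
# m.groupdict() -> {'project': 'cmip5', 'model': 'MYMODEL',
#                   'experiment': 'historical', 'ensemble': 'r1i1p1'}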
Example No. 40
0
   def evt_remove_dataset(self, parent):
      from esgcet.publish import pollDatasetPublicationStatus

      # Start the busy routine to indicate to the users something is happening
      parent.busyCursor = 'watch'
      parent.busyWidgets = [parent.pane2.pane( 'EditPaneTop' ), parent.pane2.pane( 'EditPaneBottom' ), parent.pane2.pane( 'EditPaneStatus' ), parent.pane.pane( 'ControlPane' )]
      pub_busy.busyStart( parent )

      datasetNames = []
      GUI_line = {}
      DELETE = 1
      #UNPUBLISH = 2
      NO_OPERATION = 3
      DeleteLocalDB = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox1() #   DeleteLocalDB 
      DeleteGateway = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox2() #   DeleteGateway
      DeleteThredds = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox3() #   DeleteThredds


      selected_page = parent.main_frame.selected_top_page
      if selected_page is not None:
         tab_name = parent.top_notebook.getcurselection()
         for x in parent.main_frame.top_page_id[selected_page]:
            if parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon' and parent.main_frame.top_page_id2[selected_page][x].cget('bg') != 'salmon':
                dset_name = parent.main_frame.top_page_id2[selected_page][x].cget('text')
                               
                #dsetVersionName1 = self.parent.parent.main_frame.top_page_id2v[selected_page][x].cget('text')
                #query_name, dset_version = parseDatasetVersionId(dsetVersionName1)
                """ ganz I am modifying this so that if a user selects a dataset without a version then we delete all versions of that dataset"""
                try:
                    dset_version = parent.main_frame.version_label[selected_page][x].cget('text')
                except:
                    dset_version = -1
                    #print 'Delete all versions'   
                #dset_version = 1
                if (dset_version == 'N/A' or not dset_version):
                    dset_version = -1
                    # continue   # not published, yet
                # Only delete published events
                status = pollDatasetPublicationStatus(dset_name, self.Session)
                if status == 3  or DeleteGateway or DeleteThredds or DeleteLocalDB:
                   #datasetNames.append(generateDatasetVersionId((dset_name, dset_version)))   
                   datasetNames.append([dset_name, dset_version])   # ganz create name/version to delete                 
                else:
                   parent.main_frame.top_page_id[selected_page][x].configure(relief = 'raised', background = 'salmon', image = self.off)
                GUI_line[ dset_name ] = x
            else:
                if parent.main_frame.top_page_id2[selected_page][x].cget('bg') == 'salmon':
                   parent.main_frame.top_page_id[selected_page][x].configure(relief = 'raised', background = 'salmon', image = self.off)
      else:
         warning("%d: No pages generated for selection. Remove is only used to remove datasets from the Publisher." % logging.WARNING)

      # Remove dataset from the gateway, etc.
      if ((DeleteGateway==0 or DeleteThredds==0) and DeleteLocalDB==1) :
          ans = self.warn_On_Removal()
          if (ans == FALSE):
              return
      
      if DeleteGateway==1:
          gatewayOp = DELETE
      else:
          gatewayOp = NO_OPERATION
    # now decide if there is anything to do
      if (gatewayOp==1 or DeleteThredds==1 or DeleteLocalDB==1) :   
          las=False
          thredds = (DeleteThredds==1)              
          deleteDset = (DeleteLocalDB==1)
              
          testProgress = (parent.parent.statusbar.show, 0, 100)
          status_dict = deleteDatasetList(datasetNames, self.Session, gatewayOp, thredds, las, deleteDset, progressCallback=testProgress)


      # Show the published status
      try:
         for x in status_dict.keys():
            status = status_dict[ x ]
            parent.main_frame.status_label[selected_page][GUI_line[x]].configure(text=pub_controls.return_status_text( status) )
      except:
         pass

      pub_busy.busyEnd( parent )
      # ganz refresh [if there were no exceptions] dataset list after deletions 
      parent.pub_buttonexpansion.query_widgets.parent.parent.ntk.evt_refresh_list_of_datasets(selected_page )
Example No. 41
0
def checkAndUpdateRepo(cmor_table_path, ds_version):
    """
        Checks for a timestamp file at a predefined location.  If it is missing or too old, pulls the repo at the given path and updates the timestamp.
    """
    # This is run during handler initialization and not for each file validation

    # Pull repo if fetched more than one day ago
    # or if never fetched before
    if os.path.exists(UPDATE_TIMESTAMP):
        mtime = os.path.getmtime(UPDATE_TIMESTAMP)
        now = time()
        if now - mtime > (86400.0):
            pull_cmor_repo = True
        else:
            pull_cmor_repo = False
    else:
        pull_cmor_repo = True

    if pull_cmor_repo:
        try:
            # Go into CMOR table path
            # Git fetch CMOR table repo
            # Go back to previous working directory
            checkedRun(('cd {} && git fetch --quiet').format(cmor_table_path))
            # Update local timestamp
            f = open(UPDATE_TIMESTAMP, "w")
            f.write("CMOR table updated at {}".format(time()))
            f.close()
            debug("Local CMOR table repository fetched or updated")
        except Exception as e:
            warning(
                "Attempt to update the cmor table repo and encountered an error: "
                + str(e))

    # Change repo branch in any case
    try:
        # Go into CMOR table path
        # Stash any changes from previous checkout
        # Checkout to the appropriate CMOR table tag
        # Go back to previous working directory
        checkedRun(
            ('cd {} && git stash --quiet && git checkout {} --quiet').format(
                cmor_table_path, ds_version))
        # Update local timestamp
        f = open(UPDATE_TIMESTAMP, "w")
        f.write("CMOR table updated at {}".format(time()))
        f.close()
        debug("Consider CMOR table tag: {}".format(ds_version))
    except Exception as e:
        raise ESGPublishError(
            "Error data_specs_version tag %s not found in the CMOR tables or other error.  Please contact support"
            % ds_version)

    # Get most up to date CMIP6_CV in any case
    if ds_version != "master":
        try:
            # Go into CMOR table path
            # PrePARE requires to have the most up to date CMIP6 CV.
            # Update CMIP6_CV.json from master branch.
            # Go back to previous working directory
            checkedRun(('cd {} && git checkout master CMIP6_CV.json --quiet'
                        ).format(cmor_table_path))
            debug("CMIP6 CV updated from master")
        except Exception as e:
            raise ESGPublishError(
                "Master branch does not exists or CMIP6_CV.json not found or other error.  Please contact support"
                % ds_version)
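# Stand-alone sketch of the staleness gate used above: fetch at most once per
# day, based on the modification time of a marker file.  The marker path in
# the usage comment is hypothetical.
import os
from time import time

def repo_fetch_is_due_sketch(timestamp_file, max_age_seconds=86400.0):
    if not os.path.exists(timestamp_file):
        return True
    return (time() - os.path.getmtime(timestamp_file)) > max_age_seconds

# repo_fetch_is_due_sketch('/tmp/cmor_table_update_timestamp')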
Example No. 42
0
    def readContext(self, cdfile, model=""):
        "Get a dictionary of keys from an open file"
        result = BasicHandler.readContext(self, cdfile)
        f = cdfile.file

        for key, value in cmorAttributes.items():
            try:
                result[key] = getattr(f, value)
                if key in cmorArrayAttributes and type(result[key]) is numpy.ndarray:
                    res = str(result[key][0])
                    if key == "run_name":
                        if res[0:3] != "run":
                            res = "run" + res
                    result[key] = res
            except:
                pass

        if "realization" in result and "initialization_method" in result and "physics_version" in result:
            ensemble = "r%si%sp%s" % (result["realization"], result["initialization_method"], result["physics_version"])
            result["ensemble"] = ensemble
            result["run_name"] = ensemble

        base = os.path.basename(cdfile.path)
        try:
            index = base.index("_")
            varname = base[0:index]
            result["variable"] = varname
        except:
            warning("File path must have the form varname_XXX: %s" % cdfile.path)

        #!WARNING: All IPSL-LUCID data goes into output
        result["product"] = "output"

        self.mapEnumeratedValues(result)

        # If realm has multiple fields, pick the first one
        if "realm" in result:
            realm = result["realm"].strip()
            if realm.find(" ") != -1:
                realms = realm.split(" ")
                result["realm"] = realms[0]

        # Parse CMOR table.
        if "table_id" in result:
            tableId = result["table_id"]
            fields = tableId.split()

            # Assume table ID has the form 'Table table_id ...'
            if len(fields) > 1 and (fields[1] in cmorTables):
                table = fields[1]
                result["cmor_table"] = table
            else:
                result["cmor_table"] = "noTable"
        else:
            result["cmor_table"] = "noTable"

        # Cache a 'drs_id' attribute for DRS-style dataset lookups
        validateDRSFieldValues(result, cdfile)
        if (
            "product" in result
            and "institute" in result
            and "model" in result
            and "experiment" in result
            and "time_frequency" in result
            and "realm" in result
            and "cmor_table" in result
            and "ensemble" in result
        ):
            drsid = "%s.%s.%s.%s.%s.%s.%s.%s.%s" % (
                DRS_ACTIVITY,
                result["product"],
                result["institute"],
                result["model"],
                result["experiment"],
                result["time_frequency"],
                result["realm"],
                result["cmor_table"],
                result["ensemble"],
            )
            result["drs_id"] = drsid

        return result
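# Sketch of the drs_id caching above, with hypothetical field values; the
# leading DRS_ACTIVITY component is assumed to be 'cmip5' for this handler.
example_context = {
    'product': 'output', 'institute': 'IPSL', 'model': 'IPSL-CM5A-LR',
    'experiment': 'historical', 'time_frequency': 'mon', 'realm': 'atmos',
    'cmor_table': 'Amon', 'ensemble': 'r1i1p1',
}
drs_id_sketch = '.'.join(['cmip5'] + [example_context[key] for key in (
    'product', 'institute', 'model', 'experiment',
    'time_frequency', 'realm', 'cmor_table', 'ensemble')])
# -> 'cmip5.output.IPSL.IPSL-CM5A-LR.historical.mon.atmos.Amon.r1i1p1'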
Example No. 43
0
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs)==0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None
    
    for flag, arg in args:
        if flag=='-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag=='--dataset':
            datasetName = arg
        elif flag=='--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag=='--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag=='--offline':
            offline = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--service':
            service = arg
        elif flag=='--use-version-dir':
            version_dir = True
        elif flag=='--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg
    
    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesTitle is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])

                for md in mapfile_md[fpath][1:]:
                    mapfile_line+=' | %s'%md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ( (appendMap is None) or (not appendMap.has_key(ds_id)) or (( fpath, "%d"% mapfile_md[fpath][0]) not in appendMap[ds_id]) ):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
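# Minimal sketch of the mapfile entry format written above (the dataset id,
# path, size and metadata values are made up; checksum fields appear only when
# a checksum client is configured in esg.ini):
def format_mapfile_line_sketch(dataset_id_version, path, size, extra_fields):
    return ' | '.join([dataset_id_version, path, str(size)] + list(extra_fields))

print(format_mapfile_line_sketch(
    'cmip5.output1.MYCENTER.MYMODEL.historical.mon.atmos.Amon.r1i1p1#20120101',
    '/data/tas_Amon_MYMODEL_historical_r1i1p1_185001-200512.nc',
    123456789,
    ['mod_time=1325376000.000000',
     'checksum=0123456789abcdef', 'checksum_type=SHA256']))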
Example No. 44
0
                                     'cf-standard-name-table.xml')
        except:
            raise ESGPublishError("No standard name table specified.")

    try:
        tree = parse(path)
    except Exception, e:
        raise ESGPublishError("Error parsing %s: %s" % (path, e))
    root = tree.getroot()
    standardNames = {}
    for node in root:
        if node.tag == 'entry':
            name = node.attrib['id'].strip()
            if len(name) > MAX_STANDARD_NAME_LENGTH:
                warning(
                    "Standard_name is too long.  Schema requires standard_name to be <= %d characters\n  %s"
                    % (MAX_STANDARD_NAME_LENGTH, name))
                continue

            units = amip = grib = description = ''
            for subnode in node:
                if subnode.tag == 'canonical_units':
                    units = subnode.text.strip()
                elif subnode.tag == 'amip':
                    amip = subnode.text
                elif subnode.tag == 'grib':
                    grib = subnode.text
                elif subnode.tag == 'description':
                    description = subnode.text
                else:
                    raise ESGPublishError(
Example No. 45
0
def esgpublishWrapper(**kw):

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", ".*\.nc$")
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(
                    directoryList, filefilt, initContext=props, datasetName=datasetName
                )
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(
                    directoryList, filefilt, initContext=props, datasetName=datasetName
                )

            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, "offline_lister_executable")
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True
            ):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(
            projectName,
            dmap,
            directoryMap,
            datasetNames,
            Session,
            aggregateDimension,
            publishOp,
            filefilt,
            initcontext,
            offline,
            properties,
            keepVersion=keepVersion,
            newVersion=version,
            extraFields=extraFields,
            masterGateway=masterGateway,
            comment=message,
            readFiles=readFiles,
        )

    result = publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog,
    )

    return result
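# Hypothetical call sketch, left commented out because it needs a working
# esg.ini, node database and registered project handlers.  It scans a
# directory tree, extracts and aggregates metadata, then writes THREDDS
# catalogs and publishes in a single call; the directory path is made up.
#
# result = esgpublishWrapper(
#     directoryList=['/data/cmip5/output1/MYCENTER/MYMODEL'],
#     projectName='cmip5',
#     filefilt=r'.*\.nc$',
#     aggregateDimension='time',
#     thredds=True,
#     publish=True)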
Example No. 46
0
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        raise ESGPublishError("Dataset not found: %s"%datasetName)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If a dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there are more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # list of all target variables of a dataset
    dset_target_vars = set()

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:
            if filevar.is_target_variable:
                dset_target_vars.add(filevar.short_name)

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise
                # If the per-file loop above does not reproduce the failure,
                # re-raise the original exception so filevarRanges is never
                # read while undefined.
                raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
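
The overlap and monotonicity test above reduces to comparing sorted per-file (first, last) values along the aggregate dimension. Below is a minimal, self-contained sketch of that comparison, with a hypothetical check_ranges helper and made-up file names; the real code operates on normalized file_variable times.

import numpy

def check_ranges(file_ranges):
    # file_ranges: (path, first, last) tuples in a common set of time units.
    # Sort in the direction implied by the first range, then compare neighbours.
    ascending = file_ranges[0][1] <= file_ranges[0][2]
    ranges = sorted(file_ranges, key=lambda r: r[1], reverse=not ascending)
    firsts = numpy.array([r[1] for r in ranges])
    lasts = numpy.array([r[2] for r in ranges])
    if ascending:
        overlaps = numpy.nonzero(lasts[:-1] >= firsts[1:])[0]
        monotonic = bool((lasts[:-1] < lasts[1:]).all())
    else:
        overlaps = numpy.nonzero(lasts[:-1] <= firsts[1:])[0]
        monotonic = bool((lasts[:-1] > lasts[1:]).all())
    return list(overlaps), monotonic

# The second file starts before the first one ends, so index 0 is reported:
print(check_ranges([('a.nc', 0.0, 30.0), ('b.nc', 29.0, 59.0), ('c.nc', 60.0, 89.0)]))
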
Ejemplo n.º 47
0
            from pkg_resources import resource_filename
            path = resource_filename('esgcet.config.etc', 'cf-standard-name-table.xml')
        except:
            raise ESGPublishError("No standard name table specified.")

    try:
        tree = parse(path)
    except Exception, e:
        raise ESGPublishError("Error parsing %s: %s"%(path, e))
    root = tree.getroot()
    standardNames = {}
    for node in root:
        if node.tag == 'entry':
            name = node.attrib['id'].strip()
            if len(name) > MAX_STANDARD_NAME_LENGTH:
                warning("Standard_name is too long.  Schema requires standard_name to be <= %d characters\n  %s"%(MAX_STANDARD_NAME_LENGTH, name))
                continue

            units = amip = grib = description = ''
            for subnode in node:
                if subnode.tag == 'canonical_units':
                    units = subnode.text.strip()
                elif subnode.tag == 'amip':
                    amip = subnode.text
                elif subnode.tag == 'grib':
                    grib = subnode.text
                elif subnode.tag == 'description':
                    description = subnode.text
                else:
                    raise ESGPublishError("Invalid standard name table tag: %s"%subnode.tag)
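
The snippet above is the body of a larger table-reading routine. For comparison, here is a minimal standalone sketch that extracts name/canonical_units pairs from a CF standard-name table with xml.etree.ElementTree; the length limit and file name are assumptions.

import xml.etree.ElementTree as ET

MAX_STANDARD_NAME_LENGTH = 255   # assumed limit; the real value comes from the schema

def read_standard_names(path):
    # Returns {standard_name: canonical_units} for entries short enough to store.
    names = {}
    for entry in ET.parse(path).getroot().findall('entry'):
        name = entry.attrib['id'].strip()
        if len(name) > MAX_STANDARD_NAME_LENGTH:
            continue
        units = entry.find('canonical_units')
        names[name] = units.text.strip() if units is not None and units.text else ''
    return names

# Usage (path is illustrative):
# print(read_standard_names('cf-standard-name-table.xml').get('air_temperature'))  # 'K'
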
Ejemplo n.º 48
0
    def readContext(self, cdfile, model=''):
        "Get a dictionary of keys from an open file"
        result = BasicHandler.readContext(self, cdfile)
        f = cdfile.file

        for key, value in cmorAttributes.items():
            try:
                result[key] = getattr(f, value)
                if key in cmorArrayAttributes and type(result[key]) is numpy.ndarray:
                    res = str(result[key][0])
                    if key=='run_name':
                        if res[0:3]!='run':
                            res = 'run'+res
                    result[key] = res
            except:
                pass

        if 'realization' in result and 'initialization_method' in result and 'physics_version' in result:
            ensemble = 'r%si%sp%s'%(result['realization'], result['initialization_method'], result['physics_version'])
            result['ensemble'] = ensemble
            result['run_name'] = ensemble

        base = os.path.basename(cdfile.path)
        try:
            index = base.index('_')
            varname = base[0:index]
            result['variable'] = varname
        except:
            warning("File path must have the form varname_XXX: %s"%cdfile.path)

        #!WARNING: I think all TAMIP2 data goes into output1
        result['product'] = 'output1'

        self.mapEnumeratedValues(result)

        # If realm has multiple fields, pick the first one
        if 'realm' in result:
            realm = result['realm'].strip()
            if realm.find(' ')!=-1:
                realms = realm.split(' ')
                result['realm'] = realms[0]

        # Parse CMOR table.
        if 'table_id' in result:
            tableId = result['table_id']
            fields = tableId.split()

            # Assume table ID has the form 'Table table_id ...'
            if len(fields)>1 and (fields[1] in cmorTables):
                table = fields[1]
                result['cmor_table'] = table
            else:
                result['cmor_table'] = 'noTable'
        else:
            result['cmor_table'] = 'noTable'


        # Cache a 'drs_id' attribute for DRS-style dataset lookups
        validateDRSFieldValues(result, cdfile)
        if 'product' in result and 'institute' in result and 'model' in result and 'experiment' in result and 'time_frequency' in result and 'realm' in result and 'cmor_table' in result and 'ensemble' in result:
            drsid = '%s.%s.%s.%s.%s.%s.%s.%s.%s'%(DRS_ACTIVITY, result['product'], result['institute'], result['model'], result['experiment'], result['time_frequency'], result['realm'], result['cmor_table'], result['ensemble'])
            result['drs_id'] = drsid
            

        return result
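
For reference, the ensemble string and DRS id built by the handler above compose as follows; the attribute values and the DRS_ACTIVITY constant below are made-up stand-ins, not values taken from a real file.

# Hypothetical parsed context, shaped like the result of readContext above
result = {
    'product': 'output1', 'institute': 'NCAR', 'model': 'CCSM4',
    'experiment': 'tamip200901', 'time_frequency': '3hr', 'realm': 'atmos',
    'cmor_table': '3hrSlev', 'realization': '1',
    'initialization_method': '1', 'physics_version': '1',
}
DRS_ACTIVITY = 'TAMIP'   # assumed value of the module-level constant

result['ensemble'] = 'r%si%sp%s' % (result['realization'],
                                    result['initialization_method'],
                                    result['physics_version'])
drs_fields = ('product', 'institute', 'model', 'experiment',
              'time_frequency', 'realm', 'cmor_table', 'ensemble')
result['drs_id'] = '.'.join([DRS_ACTIVITY] + [result[f] for f in drs_fields])
print(result['drs_id'])   # TAMIP.output1.NCAR.CCSM4.tamip200901.3hr.atmos.3hrSlev.r1i1p1
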
Ejemplo n.º 49
0
    def evt_remove_dataset(self, parent):
        from esgcet.publish import pollDatasetPublicationStatus

        # Start the busy routine to indicate to the users something is happening
        parent.busyCursor = 'watch'
        parent.busyWidgets = [
            parent.pane2.pane('EditPaneTop'),
            parent.pane2.pane('EditPaneBottom'),
            parent.pane2.pane('EditPaneStatus'),
            parent.pane.pane('ControlPane')
        ]
        pub_busy.busyStart(parent)

        datasetNames = []
        GUI_line = {}
        DELETE = 1
        #UNPUBLISH = 2
        NO_OPERATION = 3
        DeleteLocalDB = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox1()  # DeleteLocalDB
        DeleteGateway = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox2()  # DeleteGateway
        DeleteThredds = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox3()  # DeleteThredds

        selected_page = parent.main_frame.selected_top_page
        if selected_page is not None:
            tab_name = parent.top_notebook.getcurselection()
            for x in parent.main_frame.top_page_id[selected_page]:
                if parent.main_frame.top_page_id[selected_page][x].cget(
                        'bg') != 'salmon' and parent.main_frame.top_page_id2[
                            selected_page][x].cget('bg') != 'salmon':
                    dset_name = parent.main_frame.top_page_id2[selected_page][
                        x].cget('text')

                    #dsetVersionName1 = self.parent.parent.main_frame.top_page_id2v[selected_page][x].cget('text')
                    #query_name, dset_version = parseDatasetVersionId(dsetVersionName1)
                    """ ganz I am modifying this so that if a user selects a dataset without a version then we delete all versions of that dataset"""
                    try:
                        dset_version = parent.main_frame.version_label[
                            selected_page][x].cget('text')
                    except:
                        dset_version = -1
                        #print 'Delete all versions'
                    #dset_version = 1
                    if (dset_version == 'N/A' or not dset_version):
                        dset_version = -1
                        # continue   # not published, yet
                    # Only delete published events
                    status = pollDatasetPublicationStatus(
                        dset_name, self.Session)
                    if status == 3 or DeleteGateway or DeleteThredds or DeleteLocalDB:
                        #datasetNames.append(generateDatasetVersionId((dset_name, dset_version)))
                        datasetNames.append([
                            dset_name, dset_version
                        ])  # ganz create name/version to delete
                    else:
                        parent.main_frame.top_page_id[selected_page][
                            x].configure(relief='raised',
                                         background='salmon',
                                         image=self.off)
                    GUI_line[dset_name] = x
                else:
                    if parent.main_frame.top_page_id2[selected_page][x].cget(
                            'bg') == 'salmon':
                        parent.main_frame.top_page_id[selected_page][
                            x].configure(relief='raised',
                                         background='salmon',
                                         image=self.off)
        else:
            warning(
                "%d: No pages generated for selection. Remove is only used to remove datasets from the Publisher."
                % logging.WARNING)

        # Remove dataset from the gateway, etc.
        if ((DeleteGateway == 0 or DeleteThredds == 0) and DeleteLocalDB == 1):
            ans = self.warn_On_Removal()
            if (ans == FALSE):
                return

        if DeleteGateway == 1:
            gatewayOp = DELETE
        else:
            gatewayOp = NO_OPERATION

        # Now decide if there is anything to do
        if (gatewayOp == 1 or DeleteThredds == 1 or DeleteLocalDB == 1):
            las = False
            thredds = (DeleteThredds == 1)
            deleteDset = (DeleteLocalDB == 1)

            testProgress = (parent.parent.statusbar.show, 0, 100)
            status_dict = deleteDatasetList(datasetNames,
                                            self.Session,
                                            gatewayOp,
                                            thredds,
                                            las,
                                            deleteDset,
                                            progressCallback=testProgress)

        # Show the published status
        try:
            for x in status_dict.keys():
                status = status_dict[x]
                parent.main_frame.status_label[selected_page][
                    GUI_line[x]].configure(
                        text=pub_controls.return_status_text(status))
        except:
            pass

        pub_busy.busyEnd(parent)
        # ganz refresh [if there were no exceptions] dataset list after deletions
        parent.pub_buttonexpansion.query_widgets.parent.parent.ntk.evt_refresh_list_of_datasets(
            selected_page)
Ejemplo n.º 50
0
    def __invoke(self, method, params):
        # call a method on the remote server

        request = HessianWriter().write_call(method, params)

        # ----------------------------------------------------------------------
        # Patch for HTTP proxy support starts here.  [email protected]
        #
        import httplib, os, urlparse, ssl

        if self._scheme == "http":
            proxy_url = os.environ.get('http_proxy')
            if proxy_url is not None:
                if DEBUG:
                    messaging.info('Proxy detected at %s' % proxy_url)
                proxy_parts = urlparse.urlparse(proxy_url)
                proxy_host = proxy_parts.hostname
                proxy_port = proxy_parts.port
                if proxy_port is None:
                    proxy_port = 80
                h = httplib.HTTPConnection(proxy_host, port=proxy_port)
            else:
                h = httplib.HTTPConnection(self._host, port=self._port)
        else:
            ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
            conn_args = {'port' : self._port,
                         'key_file' : self._key_file,
                         'cert_file': self._cert_file,
                         'context': ctx}
            h = httplib.HTTPSConnection(self._host, **conn_args)

            # test the connection - may need unverified with test index nodes
            # (hopefully not with operational nodes)
            try:
                h.request("HEAD", "/")
                h.getresponse()
            except ssl.SSLError:
                messaging.warning('SSL error - disabling SSL verification')
                conn_args['context'] = ssl._create_unverified_context()
                h = httplib.HTTPSConnection(self._host, **conn_args)

        req_headers = {'Host': self._host,
                       'User-Agent': "hessianlib.py/%s" % __version__,
                       'Content-Length': str(len(request)),
                       }

        if DEBUG:
            messaging.info('Sending request: %s' % `request`)
        h.request("POST", self._url, request, req_headers)
        #
        # End Patch from [email protected]
        # ----------------------------------------------------------------------

        response = h.getresponse()
        headers = response.getheaders()
        errcode = response.status
        errmsg = response.reason
        # errcode, errmsg, headers = h.getreply()

        if errcode != 200:
            raise ProtocolError(self._url, errcode, errmsg, headers)

        # return self.parse_response(h.getfile())
        if DEBUG:
            messaging.info('Got response:')
        responseProxy = ResponseProxy(response)
        return self.parse_response(responseProxy)
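
The proxy handling in __invoke reduces to one decision: if http_proxy is set, open the connection to the proxy host and port instead of the target host. A version-agnostic sketch of just that decision follows; the helper name is hypothetical, and the real method additionally handles HTTPS and the SSL-verification fallback.

import os

try:                                    # Python 2 modules, as used above
    from urlparse import urlparse
    from httplib import HTTPConnection
except ImportError:                     # Python 3 equivalents
    from urllib.parse import urlparse
    from http.client import HTTPConnection

def open_http_connection(host, port):
    # Note: when going through a proxy, the subsequent request should use the
    # absolute URL of the target rather than a bare path.
    proxy_url = os.environ.get('http_proxy')
    if proxy_url:
        parts = urlparse(proxy_url)
        return HTTPConnection(parts.hostname, port=parts.port or 80)
    return HTTPConnection(host, port=port)
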
Ejemplo n.º 51
0
    def readContext(self, cdfile, model=''):
        "Get a dictionary of keys from an open file"
        result = BasicHandler.readContext(self, cdfile)
        f = cdfile.file

        for key, value in cmorAttributes.items():
            try:
                result[key] = getattr(f, value)
                if key in cmorArrayAttributes and type(
                        result[key]) is numpy.ndarray:
                    res = str(result[key][0])
                    if key == 'run_name':
                        if res[0:3] != 'run':
                            res = 'run' + res
                    result[key] = res
            except:
                pass

        if 'realization' in result and 'initialization_method' in result and 'physics_version' in result:
            ensemble = 'r%si%sp%s' % (result['realization'],
                                      result['initialization_method'],
                                      result['physics_version'])
            result['ensemble'] = ensemble
            result['run_name'] = ensemble

        base = os.path.basename(cdfile.path)
        try:
            index = base.index('_')
            varname = base[0:index]
            result['variable'] = varname
        except:
            warning("File path must have the form varname_XXX: %s" %
                    cdfile.path)

        #!WARNING: I think all TAMIP2 data goes into output1
        result['product'] = 'output1'

        self.mapEnumeratedValues(result)

        # If realm has multiple fields, pick the first one
        if 'realm' in result:
            realm = result['realm'].strip()
            if realm.find(' ') != -1:
                realms = realm.split(' ')
                result['realm'] = realms[0]

        # Parse CMOR table.
        if 'table_id' in result:
            tableId = result['table_id']
            fields = tableId.split()

            # Assume table ID has the form 'Table table_id ...'
            if len(fields) > 1 and (fields[1] in cmorTables):
                table = fields[1]
                result['cmor_table'] = table
            else:
                result['cmor_table'] = 'noTable'
        else:
            result['cmor_table'] = 'noTable'

        # Cache a 'drs_id' attribute for DRS-style dataset lookups
        validateDRSFieldValues(result, cdfile)
        if 'product' in result and 'institute' in result and 'model' in result and 'experiment' in result and 'time_frequency' in result and 'realm' in result and 'cmor_table' in result and 'ensemble' in result:
            drsid = '%s.%s.%s.%s.%s.%s.%s.%s.%s' % (
                DRS_ACTIVITY, result['product'], result['institute'],
                result['model'], result['experiment'],
                result['time_frequency'], result['realm'],
                result['cmor_table'], result['ensemble'])
            result['drs_id'] = drsid

        return result
Ejemplo n.º 52
0
def iterateOverDatasets(projectName,
                        dmap,
                        directoryMap,
                        datasetNames,
                        Session,
                        aggregateDimension,
                        operation,
                        filefilt,
                        initcontext,
                        offlineArg,
                        properties,
                        testProgress1=None,
                        testProgress2=None,
                        handlerDictionary=None,
                        perVariable=None,
                        keepVersion=False,
                        newVersion=None,
                        extraFields=None,
                        masterGateway=None,
                        comment=None,
                        forceAggregate=False,
                        readFiles=False,
                        nodbwrite=False,
                        pid_connector=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    pid_connector
        esgfpid.Connector object to register PIDs

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct):
        datasetName, versionno = datasetNames[iloop]

        # Must specify version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError(
                    "Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list."
                )

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s" %
                                      datasetName)

        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate = False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName, versionno)]) == 0:
                warning("No files specified for dataset %s, version %d." %
                        (datasetName, versionno))
                continue
            firstFile = dmap[(datasetName, versionno)][0][0]
            fileiter = datasetMapIterator(dmap,
                                          datasetName,
                                          versionno,
                                          extraFields=extraFields,
                                          offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator(
                    [direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator(
                    [sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(
                datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName,
                                       firstFile,
                                       Session,
                                       validate=True,
                                       offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    firstFile)
            projectName = handler.name
            info("Using project name = %s" % projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError(
                "Multiple projects found: %s, %s. Can only publish from one project"
                % (prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored' % name)
            else:
                context[name] = value

        # add dataset_version to context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset = None
        if testProgress1 is not None:
            testProgress1[1] = (100. / ct) * iloop
            if not offline:
                testProgress1[2] = (100. / ct) * iloop + (50. / ct)
            else:
                testProgress1[2] = (100. / ct) * iloop + (100. / ct)

        dataset = extractFromDataset(datasetName,
                                     fileiter,
                                     Session,
                                     handler,
                                     cfHandler,
                                     aggregateDimensionName=aggregateDimension,
                                     offline=offline,
                                     operation=operation,
                                     progressCallback=testProgress1,
                                     perVariable=perVariable,
                                     keepVersion=keepVersion,
                                     newVersion=newVersion,
                                     extraFields=extraFields,
                                     masterGateway=masterGateway,
                                     comment=comment,
                                     useVersion=versionno,
                                     forceRescan=forceAggregate,
                                     nodbwrite=nodbwrite,
                                     pid_connector=pid_connector,
                                     **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.

        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
            testProgress2[1] = (100. / ct) * iloop + 50. / ct
            testProgress2[2] = (100. / ct) * (iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName,
                               Session,
                               aggregateDimensionName=aggregateDimension,
                               cfHandler=cfHandler,
                               progressCallback=testProgress2,
                               datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        if (not nodbwrite):
            handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
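
A hedged sketch of how a driver script might call iterateOverDatasets from a mapfile. The project name, aggregate dimension, and the exact return shape of readDatasetMap are assumptions here, not documented CLI behaviour; Session is an SQLAlchemy sessionmaker bound to the publisher database, as elsewhere in this module.

from esgcet.publish import iterateOverDatasets, readDatasetMap, CREATE_OP

def scan_mapfile(mapfile, Session):
    dmap = readDatasetMap(mapfile)
    datasetNames = sorted(dmap.keys())          # (dataset_name, version) tuples
    return iterateOverDatasets(
        'cmip5',          # projectName (assumed)
        dmap, None,       # dataset map given, so no directoryMap
        datasetNames, Session,
        'time',           # aggregateDimension (assumed)
        CREATE_OP,        # operation
        None,             # filefilt (ignored when dmap is used)
        {},               # initcontext
        False,            # offlineArg
        {})               # properties
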
Ejemplo n.º 53
0
def datasetOrVersionName(name, version, session, deleteAll=False, restInterface=False):
    """
    Determine if the name refers to a dataset or dataset version.

    Returns (deleteAll, datasetObj, [versionObjs], isLatestVersion) where:

    datasetObj is the related dataset object, or None if neither the dataset nor the version is found;
    [versionObj] is a list of version objects to be deleted. isLatestVersion is True iff this
    version is the latest one for the dataset. It is not considered an error if the version
    does not exist in the local database, since it may still exist in THREDDS and/or the gateway.

    name
      String name to look up.

    session
      A database Session **instance**.

    version
      Version to delete. If version is -1, all version objects for the dataset are returned.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    restInterface
      Boolean, if True then name has the form 'master_id.version|data_node'.

    """

    # Parse a SOLR dataset ID if the RESTful interface is used
    if restInterface:
        saveName = name
        name, version, data_node = parseSolrDatasetId(name)
        if data_node is None:
            warning("Dataset: %s, REST interface dataset identifiers should have the form dataset_id|data_node"%saveName)

    # Lookup the dataset
    dset = session.query(Dataset).filter_by(name=name).first()

    deleteAll = (deleteAll or version==-1)
    isLatest = False
    if dset is None:
        dsetVersionObjs = []
    else:                               # It's a dataset

        # Check if this is the latest version
        versionObj = dset.getVersionObj(version=version)
        if versionObj is None:
            warning("Version %d of dataset %s not found"%(version, dset.name))
            isLatest = False
        else:
            isLatest = versionObj.isLatest()
            
        # If this is the only version, delete the entire dataset
        deleteAll = deleteAll or (versionObj is not None and len(dset.versions)==1)

        if deleteAll:
            dsetVersionObjs = dset.versions
        else:
            if versionObj is None:
                dsetVersionObjs = []
            else:
                dsetVersionObjs = [versionObj]

    return (deleteAll, dset, dsetVersionObjs, isLatest)
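
A small usage sketch, assuming it lives in the same module as the function above; the .version attribute on the returned version objects is an assumption about esgcet.model.

def versions_to_delete(name, version, session):
    # Return the version numbers a delete with these arguments would touch.
    deleteAll, dset, versionObjs, isLatest = datasetOrVersionName(name, version, session)
    if dset is None:
        return []
    return [v.version for v in versionObjs]

# versions_to_delete('cmip5.output1.NCAR.CCSM4.historical.mon.atmos.Amon.r1i1p1', -1, session)
# -> every stored version, because version == -1 implies deleteAll
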
Ejemplo n.º 54
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    data_node
        String, the datanode to unpublish (only for unpublication from Solr)

    """
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            service_certs_location = getServiceCertsLoc()
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, service_certs_location, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                if service.service_type == 'REST':
                    error('Cannot unpublish multiple versions using REST. Please specify a single dataset version ("dataset_id#1"). Skipping %s' % datasetName)
                    continue
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName
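
A hedged sketch of a typical retraction call, assuming it runs in the same module so deleteDatasetList and UNPUBLISH are in scope; the keyword choices mirror the signature above.

def retract_datasets(datasetNames, Session):
    # Retract (dataset_name, version) tuples from the index and drop their
    # THREDDS catalogs, but keep the local node-database entries.
    return deleteDatasetList(datasetNames, Session,
                             gatewayOperation=UNPUBLISH,
                             thredds=True,
                             deleteInDatabase=False,
                             restInterface=True)

# retract_datasets([('cmip5.output1.NCAR.CCSM4.historical.mon.atmos.Amon.r1i1p1', 1)], Session)
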
Ejemplo n.º 55
0
    def parseDatasetName(self, datasetName, context):
        """Parse a dataset name.

        Returns a dictionary, mapping field => value. The config file option 'dataset_id'
        is used to parse the name into fields.

        datasetName
          String dataset identifier.

        context
          Initial context dictionary. This argument is altered on output.

        """
        config = getConfig()
        section = 'project:'+self.name
        datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None)
        if datasetIdFormatList is None:
            # warning("No dataset_id option found for project %s"%self.name)
            return context
        datasetIdFormats = splitLine(datasetIdFormatList)

        formatMatched = False
        for idFormat in datasetIdFormats:

            # '.' => '\.'
            newinit = re.sub(r'\.', r'\.', idFormat.strip())
            
            # %(name)s => (?P<name>[^.]*)
            newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit)

            # If experiment is enumerated, match on the experiment options. This allows
            # experiment ids to contain periods (.) .
            experimentOptions = self.getFieldOptions('experiment')

            # Map to case-sensitive options
            experimentOptions = self.mapValidFieldOptions('experiment', experimentOptions)
            if idFormat.find('%(experiment)s')!=-1 and experimentOptions is not None:
                if len(experimentOptions) > 0:
                    optionOr = reduce(lambda x,y: x+'|'+y, experimentOptions)
                    experimentPattern = r'(?P<experiment>%s)'%optionOr
                    newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern)
            
            if newinit[-1]!='$':
                newinit += '$'

            match = re.match(newinit, datasetName)

            if match is None:
                continue
            else:
                result = match.groupdict()
                formatMatched = True
            for key, value in result.items():
                if context.has_key(key) and value!=context[key]:
                    warning("Dataset ID=%s, but %s=%s"%(datasetName, key, context[key]))
                else:
                    context[str(key)] = value
            break

        if not formatMatched:
            warning("Dataset ID: %s does not match the dataset_id format(s): %s"%(datasetName, `datasetIdFormats`))

        return context
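
The core of parseDatasetName is the translation of a dataset_id format into a regular expression with one named group per field. Here is a standalone sketch of just that step; the sample format string is made up, and _patpat is assumed to match %(name)s tokens as in the module above.

import re

_patpat = re.compile(r'%\(([^()]*)\)s')   # assumed: matches %(name)s tokens

def format_to_regex(idFormat):
    # Escape literal dots first, then turn each %(name)s into a named group,
    # and anchor the pattern at the end of the string.
    pattern = re.sub(r'\.', r'\\.', idFormat.strip())
    pattern = _patpat.sub(r'(?P<\1>[^.]*)', pattern)
    if not pattern.endswith('$'):
        pattern += '$'
    return pattern

m = re.match(format_to_regex('%(project)s.%(model)s.%(experiment)s'),
             'cmip5.CCSM4.historical')
print(m.groupdict())   # {'project': 'cmip5', 'model': 'CCSM4', 'experiment': 'historical'}
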
Ejemplo n.º 56
0
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None,
                      deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None):
    """
    Delete or retract a list of datasets:

    - Delete the dataset from the gateway.
    - Remove the catalogs from the THREDDS catalog (optional).
    - Reinitialize the LAS server and THREDDS server.
    - Delete the database entry (optional).

    if republish is False:
      Returns a status dictionary: datasetName => status
    else
      Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished.

    datasetNames
      A list of (dataset_name, version) tuples.

    Session
      A database Session.

    gatewayOperation
      An enumeration. If:
      - publish.DELETE: Remove all metadata from the gateway database.
      - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway.
      - publish.NO_OPERATION: No gateway delete/retract operation is called.

    thredds
      Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server.

    las
      Boolean flag: if true (default is False), reinitialize the LAS server.

    deleteInDatabase
      Boolean flag: if true (default is False), delete the database entry.
    
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    republish
      Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished.

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    pid_connector
        esgfpid.Connector object to register PIDs

    project_config_section
        Name of the project config section in esg.ini (for user specific project configs)

    data_node
        String, the datanode to unpublish (only for unpublication from Solr)

    """
    if gatewayOperation == UNINITIALIZED:
        raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!")

    if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION):
        raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation)
    deleteOnGateway = (gatewayOperation==DELETE)
    operation = (gatewayOperation!=NO_OPERATION)

    session = Session()
    resultDict = {}
    config = getConfig()

    # Check the dataset names and cache the results for the gateway, thredds, and database phases
    nameDict = {}
    for datasetName,version in datasetNames:
        isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface)
        if dset is None:
            warning("Dataset not found in node database: %s"%datasetName)
        nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest)

    # Delete the dataset from the gateway.
    if operation:

        # Create the web service proxy
        threddsRootURL = config.get('DEFAULT', 'thredds_url')
        serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile')
        serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile')
        if not restInterface:
            serviceURL = getHessianServiceURL(project_config_section=project_config_section)
            servicePort = config.getint('DEFAULT','hessian_service_port')
            serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug')
            service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug)
        else:
            serviceURL = getRestServiceURL(project_config_section=project_config_section)
            serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False)
            service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug)

        for datasetName,version in datasetNames:
            if version > -1:
                datasetToUnpublish = '%s.v%s' % (datasetName, version)
            else:
                datasetToUnpublish = datasetName
            isDataset, dset, versionObjs, isLatest = nameDict[datasetName]
            try:
                eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node)
            except RemoteCallException, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n')))
                continue
            except ESGPublishError, e:
                fields = `e`.split('\n')
                error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n')))
                continue
            info("  Result: %s"%stateName)
            resultDict[datasetName] = eventName
Ejemplo n.º 57
0
def datasetOrVersionName(name, version, session, deleteAll=False, restInterface=False):
    """
    Determine if the name refers to a dataset or dataset version.

    Returns (deleteAll, datasetObj, [versionObjs], isLatestVersion) where:

    datasetObj is the related dataset object, or None if neither the dataset nor the version is found;
    [versionObj] is a list of version objects to be deleted. isLatestVersion is True iff this
    version is the latest one for the dataset. It is not considered an error if the version
    does not exist in the local database, since it may still exist in THREDDS and/or the gateway.

    name
      String name to look up.

    session
      A database Session **instance**.

    version
      Version to delete. If version is -1, all version objects for the dataset are returned.

    deleteAll
      Boolean, if True delete all versions of the dataset(s).

    restInterface
      Boolean, if True then name has the form 'master_id.version|data_node'.

    """

    # Parse a SOLR dataset ID if the RESTful interface is used
    if restInterface:
        saveName = name
        name, version, data_node = parseSolrDatasetId(name)
        if data_node is None:
            warning("Dataset: %s, REST interface dataset identifiers should have the form dataset_id|data_node"%saveName)

    # Lookup the dataset
    dset = session.query(Dataset).filter_by(name=name).first()

    deleteAll = (deleteAll or version==-1)
    isLatest = False
    if dset is None:
        dsetVersionObjs = []
    else:                               # It's a dataset

        # Check if this is the latest version
        versionObj = dset.getVersionObj(version=version)
        if versionObj is None:
            warning("Version %d of dataset %s not found"%(version, dset.name))
            isLatest = False
        else:
            isLatest = versionObj.isLatest()
            
        # If this is the only version, delete the entire dataset
        deleteAll = deleteAll or (versionObj is not None and len(dset.versions)==1)

        if deleteAll:
            dsetVersionObjs = dset.versions
        else:
            if versionObj is None:
                dsetVersionObjs = []
            else:
                dsetVersionObjs = [versionObj]

    return (deleteAll, dset, dsetVersionObjs, isLatest)
    def start_harvest( self, parent ):
        from esgcet.publish import publishDatasetList
        from esgcet.model import Dataset, PUBLISH_FAILED_EVENT, ERROR_LEVEL

        dcolor1 = Pmw.Color.changebrightness(self.parent.parent, 'aliceblue', 0.8 )

        # Make sure the publisher is logged in
       # if not self.parent.parent.password_flg:
       #    self.parent.parent.menu.login_menu.evt_login( self.parent.parent )

        # Start the busy routine to indicate to the users something is happening
        self.parent.parent.busyCursor = 'watch'
        self.parent.parent.busyWidgets = [self.parent.parent.pane2.pane( 'EditPaneTop' ), self.parent.parent.pane2.pane( 'EditPaneBottom' ), self.parent.parent.pane2.pane( 'EditPaneStatus' ), self.parent.parent.pane.pane( 'ControlPane' )]
        pub_busy.busyStart( self.parent.parent )
        try:
        # Generate the list of datasets to be published
           datasetNames=[]
           GUI_line = {}
           tab_name = self.parent.parent.top_notebook.getcurselection()
           selected_page = self.parent.parent.main_frame.selected_top_page

           if (selected_page is None):
              warning("Must generate a list of datasets to scan before publishing can occur.")
              pub_busy.busyEnd( self.parent.parent )
              return

           for x in self.parent.parent.main_frame.top_page_id[selected_page]:

               if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon' and self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('bg') != 'salmon':
                   dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')
                   # ganz added this 1/18/11
                   versionNum = self.parent.parent.main_frame.version_label[selected_page][x].cget('text')
                   dsetTuple = (dset_name, versionNum)
                   # dsetName = generateDatasetVersionId(dsetTuple)
                   # dsetTuple = parseDatasetVersionId(dset_name) # ganz no longer necessary
                   datasetNames.append(dsetTuple)
                   GUI_line[ dset_name ] = x
               else:
                   if self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('bg') == 'salmon':
                      self.parent.parent.main_frame.top_page_id[selected_page][x].configure(relief = 'raised', background = 'salmon', image = self.off)

        # Publish collection of datasets
           testProgress = (self.parent.parent.statusbar.show, 0, 100)
           publishThredds = (quality_control_widgets.get_CheckBox3()==1)
           publishGateway = (quality_control_widgets.get_CheckBox2()==1)
           if (publishThredds):
               print 'publishing to Thredds'
           if (publishGateway):
               print 'publishing to Gateway'  
                      
           status_dict = publishDatasetList(datasetNames, self.Session, publish=publishGateway, thredds=publishThredds, progressCallback=testProgress)

        # Show the published status
           for x in status_dict.keys():
                status = status_dict[ x ]
                dsetName, versionNo = x
                dsetVersionName = generateDatasetVersionId(x)
                guiLine = GUI_line[dsetName] # dsetVersionName]
            
                self.parent.parent.main_frame.status_label[selected_page][guiLine].configure(text=pub_controls.return_status_text( status) )
                dset = Dataset.lookup(dsetName, self.Session)
                if dset.has_warnings(self.Session):
                    warningLevel = dset.get_max_warning_level(self.Session)
                    if warningLevel>=ERROR_LEVEL:
                        buttonColor = "pink"
                        buttonText = "Error"
                    else:
                        buttonColor = "yellow"
                        buttonText = "Warning"
                    self.parent.parent.main_frame.ok_err[selected_page][guiLine].configure(
                        text = buttonText,
                        bg = buttonColor,
                        relief = 'raised',
                        command = pub_controls.Command( self.parent.parent.pub_buttonexpansion.extraction_widgets.error_extraction_button, dset ) )
                else:
                    self.parent.parent.main_frame.ok_err[selected_page][guiLine].configure(
                        text = 'Ok',
                        bg = dcolor1,
                        highlightcolor = dcolor1,
                        relief = 'sunken',
                        )
        except:
            pub_busy.busyEnd( self.parent.parent )  # catch here in order to turn off the busy cursor ganz
            raise
        finally:
           pub_busy.busyEnd( self.parent.parent )
           self.my_refresh()
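
Outside the GUI, the same publication step is just a call to publishDatasetList. A hedged, non-GUI sketch follows; Session construction is omitted and assumed, and the warning/error reporting mirrors the checks made in start_harvest above.

from esgcet.publish import publishDatasetList
from esgcet.model import Dataset, ERROR_LEVEL

def publish_and_report(datasetNames, Session):
    # Publish (dataset_name, version) tuples to THREDDS and the index, then
    # report which datasets accumulated warnings or errors during the scan.
    status_dict = publishDatasetList(datasetNames, Session,
                                     publish=True, thredds=True)
    for (dsetName, versionNo), status in status_dict.items():
        dset = Dataset.lookup(dsetName, Session)
        if dset is not None and dset.has_warnings(Session):
            level = dset.get_max_warning_level(Session)
            flag = 'ERROR' if level >= ERROR_LEVEL else 'WARNING'
        else:
            flag = 'OK'
        print('%s.v%s: status=%s %s' % (dsetName, versionNo, status, flag))
    return status_dict
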