Example #1
def instantiateHandler(cls, *args, **extra_args):
    """
    Instantiate the handler class with the specified arguments and extra arguments,
    but filtering out anything that it doesn't support.
    """
    passed_args = {}
    supported_args = inspect.getargspec(cls.__init__).args
    for k, v in extra_args.iteritems():
        if k in supported_args:
            passed_args[k] = v
        else:
            debug("Discarding arg '%s' not supported by handler" % k)
    return cls(*args, **passed_args)
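
Portability note: inspect.getargspec and dict.iteritems are Python 2 only (getargspec was removed in Python 3.11). A minimal Python 3 sketch of the same filtering idea, using inspect.signature; handler classes whose __init__ takes **kwargs would need extra handling:

import inspect

def instantiate_handler(cls, *args, **extra_args):
    # Sketch only: drop keyword arguments that cls.__init__ does not declare.
    supported = inspect.signature(cls.__init__).parameters
    passed = {k: v for k, v in extra_args.items() if k in supported}
    return cls(*args, **passed)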
Example #2
    def __init__(self, url, port=None, key_file=None, cert_file=None, debug=False):
        """
        Create a Hessian proxy.

        url
          String of the form http[s]://host/path. Note that the port is specified separately.

        port
          Defaults to 80 for http, 443 for https.

        key_file
          Key file in PEM format, if the scheme is https and client authentication is to be used.

        cert_file
          User certificate in PEM format, if the scheme is https and client authentication is to be used.

        debug
          True iff debug info is to be printed.

        """
        # Creates a Hessian proxy object
        global DEBUG

        self.service_type = 'HESSIAN'
        self._url = url
        self._port = port
        self._key_file = key_file
        self._cert_file = cert_file
        # print "Using key file = %s, cert file = %s"%(key_file, cert_file)
        messaging.debug("Using key file = %s, cert file = %s" % (key_file, cert_file))
        if debug:
            DEBUG = True

        # get the uri
        scheme, uri = urllib.splittype(url)
        if scheme not in ["http", "https"]:
            raise IOError, "unsupported Hessian protocol"

        self._scheme = scheme
        self._host, self._uri = urllib.splithost(uri)
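
A hypothetical construction sketch; only __init__ is shown above, so HessianProxy, the URL, and the file paths below are placeholders:

proxy = HessianProxy("https://esg.example.org/remote/hessian/publishingService",
                     key_file="/etc/grid-security/hostkey.pem",
                     cert_file="/etc/grid-security/hostcert.pem",
                     debug=True)
# Port defaults to 443 for https, per the docstring.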
Example #3
    def validateFile(self, fileobj):
        """
        For CMIP6, this will first verify whether the data was written by CMOR at or above the
        minimum version set in the ini file. If so, the file is declared valid. If not, the file
        goes through the PrePARE controlled-vocabulary (CV) check. PrePARE also runs CFChecker.

        Raises ESGPublishError if settings are missing or the file fails the checks.
        Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        if self.replica:
            debug("skipping PrePARE for replica (file %s)" % f)
            return

        # TODO: refactoring; these could be loaded upfront in the constructor
        config = getConfig()
        project_section = 'project:' + self.name
        project_config_section = 'config:' + self.name
        min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0")
        min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0")
        data_specs_version = config.get(project_config_section, "data_specs_version", default="master")
        cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH)

        try:
            file_cmor_version = fileobj.getAttribute('cmor_version', None)
        except:
            file_cmor_version = None
            debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

        passed_cmor = False
        if compareLibVersions(min_cmor_version, file_cmor_version):
            debug('File %s cmor-ized at version %s, passed!' % (f, file_cmor_version))
            passed_cmor = True

        try:
            table = fileobj.getAttribute('table_id', None)
        except:
            raise ESGPublishError("File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)
        except:
            raise ESGPublishError("File %s missing required variable_id global attribute" % f)

        # data_specs_version drives CMOR table fetching:
        # - Default: fetch the "master" branch (when "data_specs_version" is not set in esg.ini)
        # - Fetch the branch named by "data_specs_version=my_branch" in esg.ini
        # - Fetch the branch named by the file's global attributes when "data_specs_version=file" is set in esg.ini

        try:
            file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
        except Exception as e:
            raise ESGPublishError("File %s missing required data_specs_version global attribute"%f)

        if not compareLibVersions(min_ds_version, file_data_specs_version):
            raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s"%(f,file_data_specs_version,min_ds_version))
        # at this point the file has the correct data specs version.
        # if also was CMORized and has the correct version tag, we can exit

        if passed_cmor:
            return
            
        if data_specs_version == "file":
            data_specs_version = file_data_specs_version

        checkAndUpdateRepo(cmor_table_path, data_specs_version)

        try:
            process = validator.checkCMIP6(cmor_table_path)
            if process is None:
                raise ESGPublishError("File %s failed the CV check - object create failure"%f)
            process.ControlVocab(f)
        except:
            raise ESGPublishError("File %s failed the CV check"%f)
Example #4
    def validateFile(self, fileobj):
        """
        For CMIP6, this will first verify whether the data was written by CMOR at or above the
        minimum version set in the ini file. If so, the file is declared valid. If not, the file
        goes through the PrePARE controlled-vocabulary (CV) check. PrePARE also runs CFChecker.

        Raises ESGPublishError if settings are missing or the file fails the checks.
        Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        config = getConfig()
        projectSection = 'project:' + self.name
        min_cmor_version = config.get(projectSection,
                                      "min_cmor_version",
                                      default="0.0.0")

        file_cmor_version = "0.0.0"

        try:
            file_cmor_version = fileobj.getAttribute('cmor_version', None)
        except:
            debug(
                'File %s missing cmor_version attribute; will proceed with PrePARE check'
                % f)

        if compareLibVersions(min_cmor_version, file_cmor_version):
            debug('File %s cmor-ized at version %s, passed!' %
                  (f, file_cmor_version))
            return

        # PrePARE is going to handle the CF check now
        # min_cf_version = config.get(projectSection, "min_cf_version", default="")

        # if len(min_cf_version) == 0:
        #     raise ESGPublishError("Minimum CF version not set in esg.ini")

        # fakeversion = ["cfchecker.py", "-v", min_cf_version
        # , "foo"]
        # (badc,coards,uploader,useFileName,standardName,areaTypes,udunitsDat,version,files)=getargs(fakeversion)
        # CF_Chk_obj = CFChecker(uploader=uploader, useFileName=useFileName, badc=badc, coards=coards, cfStandardNamesXML=standardName, cfAreaTypesXML=areaTypes, udunitsDat=udunitsDat, version=version)
        # rc = CF_Chk_obj.checker(f)

        # if (rc > 0):
        #     raise ESGPublishError("File %s fails CF check"%f)

        file_data_specs_version = None
        try:
            file_data_specs_version = fileobj.getAttribute(
                'data_specs_version', None)
        except Exception as e:
            raise ESGPublishError(
                "File %s missing required data_specs_version global attribute"
                % f)

        table = None
        try:
            table = fileobj.getAttribute('table_id', None)

        except:
            raise ESGPublishError(
                "File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)

        except:
            raise ESGPublishError(
                "File %s missing required variable_id global attribute" % f)

        project_config_section = 'config:cmip6'

        cmor_table_path = ""
        try:
            cmor_table_path = config.get(project_config_section,
                                         "cmor_table_path",
                                         default="")
        except:
            debug("Missing cmor_table_path setting. Using default location")

        if cmor_table_path == "":
            cmor_table_path = DEFAULT_CMOR_TABLE_PATH

        checkAndUpdateRepo(cmor_table_path, self, file_data_specs_version)

        table_file = cmor_table_path + '/CMIP6_' + table + '.json'
        fakeargs = ['--variable', variable_id, table_file, f]
        parser = argparse.ArgumentParser(prog='esgpublisher')
        parser.add_argument('--variable')
        parser.add_argument('cmip6_table', action=validator.JSONAction)
        parser.add_argument('infile', action=validator.CDMSAction)
        parser.add_argument('outfile',
                            nargs='?',
                            help='Output file (default stdout)',
                            type=argparse.FileType('w'),
                            default=sys.stdout)
        args = parser.parse_args(fakeargs)

        #        print "About to CV check:", f

        try:
            process = validator.checkCMIP6(args)
            if process is None:
                raise ESGPublishError(
                    "File %s failed the CV check - object create failure" % f)

            process.ControlVocab()

        except:

            raise ESGPublishError("File %s failed the CV check" % f)
Example #5
def nodeIterator(top, nodefilt, filefilt, followSymLinks=True, allFiles=False):
    """Generate an iterator over non-empty directories that match a pattern.

    Returns an iterator that returns a tuple (*path*, *sample_file*, *groupdict*) at each iteration, where:

    - *path* is the node (directory) path
    - *sample_file* is a file in the node that matches the file filter
    - *groupdict* is the group dictionary generated by the match. For example, if *nodefilt* contains a named group '(?P<model>...)' that matches 'some_value', then *groupdict* maps 'model' => 'some_value'

    top
      The top-level directory path to search.

    nodefilt
      A regular expression as defined in the Python re module. Each node returned matches the expression.
      May also be a list of regular expressions, in which case each node returned matches at least one expression
      in the list.

    filefilt
      A regular expression as defined in the Python re module. Each sample file returned has basename matching the filter.

    followSymLinks
      Boolean flag. Symbolic links are followed unless followSymLinks is False.

    allFiles
      Boolean flag, default False. If True, iterate over all files that match the filter. Otherwise just return
      the first file that matches.

    """

    try:
        names = os.listdir(top)
    except os.error:
        return

    if type(nodefilt) is not type([]):
        nodefilt = [nodefilt]

    foundOne = False
    for basename in names:
        name = os.path.join(top, basename)
        try:
            if followSymLinks:
                st = os.stat(name)
            else:
                st = os.lstat(name)
        except os.error:
            continue

        # Search regular files in top directory
        if stat.S_ISREG(st.st_mode):
            if not foundOne or allFiles:

                # Find the first node filter that matches
                for filt in nodefilt:
                    result = re.match(filt, top)
                    debug("Comparing %s with filter %s ..."%(top, filt))
                    if result is not None:
                        debug("... match")
                        break
                    debug("... no match")
                    
                # If the node pattern matches and the file filter matches the basename:
                if (result is not None) and (re.match(filefilt, basename) is not None):
                    groupdict = result.groupdict()
                    foundOne = True
                    yield (top, basename, groupdict)
            
        # Search subdirectories
        elif stat.S_ISDIR(st.st_mode):
            for nodepath, filepath, gdict in nodeIterator(name, nodefilt, filefilt, followSymLinks=followSymLinks):
                yield (nodepath, filepath, gdict)

    return
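
A hypothetical usage sketch: walk a data tree for directories whose path ends in a model name, sampling one netCDF file per directory (paths and patterns are illustrative):

for path, sample_file, groups in nodeIterator('/data/archive',
                                              r'.*/(?P<model>[^/]+)$',
                                              r'.*\.nc$'):
    print("%s: %s (model=%s)" % (path, sample_file, groups.get('model')))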
Example #6
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        raise ESGPublishError("Dataset not found: %s"%datasetName)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If a dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there are more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds]:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
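
The overlap test above compares, after sorting by first value, each range's last value with the next range's first value. A standalone illustration:

import numpy

# The second range [10, 21] spills past the third range's start (20).
firstValues = numpy.array([0.0, 10.0, 20.0])
lastValues = numpy.array([9.0, 21.0, 29.0])
compare = lastValues[0:-1] >= firstValues[1:]
print(compare.nonzero()[0])   # -> [1]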
Example #7
def extractFromFile(dataset, openfile, fileobj, session, cfHandler, aggdimName=None, varlocate=None, **context):
    """
    Extract metadata from a file, add to a database.

    dataset
      The dataset instance.

    openfile
      An open netCDF file object.

    fileobj
      A (logical) file instance.

    session
      A database session instance.

    cfHandler
      A CF handler instance

    aggdimName
      The name of the dimension which is split across files, if any.

    varlocate
      List with elements [varname, pattern]. The variable will be extracted from the file only if the filename
      matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']]

    context
      A dictionary with keys project, model, experiment, and run.

    """

    fileVersion = fileobj.versions[-1]

    # Get the aggregate dimension range
    if aggdimName is not None and openfile.hasVariable(aggdimName):
        aggvarFirst = openfile.getVariable(aggdimName, index=0)
        aggvarLast = openfile.getVariable(aggdimName, index=-1)
        aggvarLen = openfile.inquireVariableShape(aggdimName)[0]
        aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName))
        if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"):
            if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12:
                dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE)

    if aggdimName is not None and not openfile.hasVariable(aggdimName):
        info("Aggregate dimension not found: %s"%aggdimName)

    varlocatedict = {}
    if varlocate is not None:
        for varname, pattern in varlocate:
            varlocatedict[varname] = pattern

    # For each variable in the file:
    for varname in openfile.inquireVariableList():
        varshape = openfile.inquireVariableShape(varname)
        debug("%s%s"%(varname, `varshape`))

        # Check varlocate
        if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname], os.path.basename(fileVersion.location)):
            debug("Skipping variable %s in %s"%(varname, fileVersion.location))
            continue

        # Create a file variable
        filevar = FileVariable(varname, openfile.getAttribute('long_name', varname, None))
        fileobj.file_variables.append(filevar)

        # Create attributes:
        for attname in openfile.inquireAttributeList(varname):
            attvalue = openfile.getAttribute(attname, varname)
            atttype, attlen = getTypeAndLen(attvalue)
            attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen)
            filevar.attributes.append(attribute)
            debug('  %s.%s = %s'%(varname, attname, `attvalue`))

        # Create dimensions
        seq = 0
        dimensionList = openfile.inquireVariableDimensions(varname)
        for dimname, dimlen in zip(dimensionList, varshape):
            dimension = FileVariableDimension(dimname, dimlen, seq)
            filevar.dimensions.append(dimension)
            if dimname==aggdimName:
                filevar.aggdim_first = float(aggvarFirst)
                filevar.aggdim_last = float(aggvarLast)
                filevar.aggdim_units = aggvarunits
            seq += 1

        # Set coordinate axis range and type if applicable
        if len(varshape)==1:
            var0 = openfile.getVariable(varname, index=0)
            varn = openfile.getVariable(varname, index=-1)
            if cfHandler.axisIsLatitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Y'
            elif cfHandler.axisIsLongitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'X'
            elif cfHandler.axisIsLevel(filevar):
                vararray = openfile.getVariable(varname)
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Z'
                filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above

    # Create global attribute
    for attname in openfile.inquireAttributeList():
        attvalue = openfile.getAttribute(attname, None)
        atttype, attlen = getTypeAndLen(attvalue)
        attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen)
        fileobj.attributes.append(attribute)
        if attname=='tracking_id':
            fileVersion.tracking_id = attvalue
        debug('.%s = %s'%(attname, attvalue))
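
The varlocate check above extracts a variable only when the file's basename matches the variable's pattern. A standalone illustration:

import os
import re

varlocatedict = {'ps': 'ps_'}
location = '/data/run1/ps_Amon_model_run1.nc'
if re.match(varlocatedict['ps'], os.path.basename(location)):
    print("extract 'ps' from %s" % location)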
Example #8
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        raise ESGPublishError("Dataset not found: %s"%datasetName)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If a dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry will only have multiple tuples if
                                        #         there are more than one variable with the same name
                                        #         and different domains.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # list of all target variables of a dataset
    dset_target_vars = set()

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:
            if filevar.is_target_variable:
                dset_target_vars.add(filevar.short_name)

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono<=0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if (var not in [aggDim, aggDimBounds]):
                if mono<=0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if mono<=0:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
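
A hypothetical call sketch, assuming a session factory and CF handler already configured by the publisher (all names below are placeholders):

aggregateVariables('cmip6.CMIP.MIROC.MIROC6.historical.r1i1p1f1',
                   Session,
                   aggregateDimensionName='time',
                   cfHandler=cf_handler)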
Example #9
def extractFromFile(dataset, openfile, fileobj, session, handler, cfHandler, aggdimName=None, varlocate=None, exclude_variables=None, perVariable=None, **context):
    """
    Extract metadata from a file, add to a database.

    dataset
      The dataset instance.

    openfile
      An open netCDF file object.

    fileobj
      A (logical) file instance.

    session
      A database session instance.

    cfHandler
      A CF handler instance

    handler
      Project handler

    aggdimName
      The name of the dimension which is split across files, if any.

    varlocate
      List with elements [varname, pattern]. The variable will be extracted from the file only if the filename
      matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']]

    exclude_variables
      List of variables to exclude (thredds_exclude_variables).

    perVariable
      Boolean; if True, try to find a target_variable, if False extract all variables.

    context
      A dictionary with keys project, model, experiment, and run.

    """

    fileVersion = fileobj.versions[-1]

    # Get the aggregate dimension range
    if aggdimName is not None and openfile.hasVariable(aggdimName):
        aggvarFirst = openfile.getVariable(aggdimName, index=0)
        aggvarLast = openfile.getVariable(aggdimName, index=-1)
        aggvarLen = openfile.inquireVariableShape(aggdimName)[0]
        aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName))
        if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"):
            if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12:
                dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE)

    if aggdimName is not None and not openfile.hasVariable(aggdimName):
        info("Aggregate dimension not found: %s"%aggdimName)

    varlocatedict = {}
    if varlocate is not None:
        for varname, pattern in varlocate:
            varlocatedict[varname.strip()] = pattern.strip()

    # Create global attribute
    target_variable = None
    for attname in openfile.inquireAttributeList():
        attvalue = openfile.getAttribute(attname, None)
        atttype, attlen = getTypeAndLen(attvalue)
        attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen)
        fileobj.attributes.append(attribute)
        if attname == 'tracking_id':
            fileVersion.tracking_id = attvalue
        # extract target_variable from global attributes
        if attname == 'variable_id' and perVariable:
            target_variable = attvalue
            debug('Extracted target variable from global attributes: %s' % target_variable)
        debug('.%s = %s' % (attname, attvalue))

    # try to get target_variable from DRS if not found in global attributes
    if not target_variable and perVariable:
        config = getConfig()
        if config is not None:
            drs_pattern = handler.getFilters()[0][1:-1]
            drs_file_pattern = '%s/(?P<filename>[\w.-]+)$' % drs_pattern
            drs_parts = re.search(drs_file_pattern, openfile.path).groupdict()
            if 'variable' in drs_parts:
                target_variable = drs_parts['variable']
                debug('Extracted target variable from DRS: %s' % target_variable)

    # target_variable must be present in the file
    if target_variable not in openfile.inquireVariableList():
        target_variable = None

    # For each variable in the file:
    for varname in openfile.inquireVariableList():

        # we need to extract only target, aggregation and coverage variables
        if target_variable:
            is_coverage_variable = check_coverage_variable(varname, openfile)
            if not is_coverage_variable and varname != target_variable and varname != aggdimName:
                debug("Skipping variable %s in %s (not target (%s), coverage or aggregation (%s) variable)" % (varname, fileVersion.location, target_variable, aggdimName))
                continue

        varshape = openfile.inquireVariableShape(varname)
        debug("%s%s"%(varname, `varshape`))

        # Check varlocate
        if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname].strip(), os.path.basename(fileVersion.location)):
            debug("Skipping variable %s in %s"%(varname, fileVersion.location))
            continue

        is_target_variable = True
        if target_variable and target_variable != varname:
            is_target_variable = False
        elif varname in exclude_variables:
            is_target_variable = False

        # Create a file variable
        varstr = openfile.getAttribute('long_name', varname, None)
        
        if varstr is not None and len(varstr) > 255:
            varstr = varstr[0:255]
        filevar = FileVariable(varname, varstr, is_target_variable=is_target_variable)
        fileobj.file_variables.append(filevar)

        # Create attributes:
        for attname in openfile.inquireAttributeList(varname):
            attvalue = openfile.getAttribute(attname, varname)
            atttype, attlen = getTypeAndLen(attvalue)
            attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen)
            filevar.attributes.append(attribute)
            debug('  %s.%s = %s'%(varname, attname, `attvalue`))

        # Create dimensions
        seq = 0
        dimensionList = openfile.inquireVariableDimensions(varname)
        for dimname, dimlen in zip(dimensionList, varshape):
            dimension = FileVariableDimension(dimname, dimlen, seq)
            filevar.dimensions.append(dimension)
            if dimname==aggdimName:
                filevar.aggdim_first = float(aggvarFirst)
                filevar.aggdim_last = float(aggvarLast)
                filevar.aggdim_units = aggvarunits
            seq += 1

        # Set coordinate axis range and type if applicable
        if len(varshape)==1:
            var0 = openfile.getVariable(varname, index=0)
            if var0 is None:
                continue
            varn = openfile.getVariable(varname, index=-1)
            if cfHandler.axisIsLatitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Y'
            elif cfHandler.axisIsLongitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'X'
            elif cfHandler.axisIsLevel(filevar):
                vararray = openfile.getVariable(varname)
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Z'
                filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above
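
The DRS fallback above recovers the variable name from the file path with a named-group regex built from handler.getFilters(). A standalone illustration with a hypothetical pattern:

import re

# Hypothetical DRS-style pattern; the real one comes from handler.getFilters().
drs_file_pattern = r'/(?P<variable>[^/_]+)_[^/]*/v\d+/(?P<filename>[\w.-]+)$'
m = re.search(drs_file_pattern, '/data/tas_Amon/v20200101/tas_Amon_model.nc')
if m:
    print(m.groupdict()['variable'])   # -> tas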
Example #10
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None, offline=False, operation=CREATE_OP,
                       progressCallback=None, stopEvent=None, perVariable=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None,
                       comment=None, useVersion=-1, forceRescan=False, nodbwrite=False, pid_connector=None, test_publication=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    fileIterator
      An iterator that returns an iteration of (file_path, file_size), where file_size is an integer.

    dbSession
      A database Session.

    handler
      Project handler

    cfHandler  
      A CF handler instance

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    offline
      Boolean, True if the files are offline, cannot be scanned.

    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.

    useVersion=-1:
      Integer version number of the dataset version to modify. By default the latest version is modified.

    forceRescan
      Boolean, if True force all files to be rescanned on an update.

    pid_connector
      ESGF_PID_connector object used to register PIDs.

    test_publication
      Boolean flag, True if the publication is a test rather than production.

    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields needed to uniquely define the dataset.

    """

    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s'%context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None

        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None

        versionByDate = config.getboolean(section, 'version_by_date', default=False)

        if not offline:
            if perVariable is None:
                perVariable = config.getboolean(section, 'variable_per_file', False)
        else:
            # offline files cannot be scanned per variable
            perVariable = False

        exclude_variables = splitLine(config.get(section, 'thredds_exclude_variables', default=''), sep=',')
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False
        exclude_variables = []

    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType
    configOptions['exclude_variables'] = exclude_variables
    configOptions['perVariable'] = perVariable

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation==CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s"%datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset."%dset.name)

    createTime = datetime.datetime.now() # DatasetVersion creation_time
    fobjs = None
    pathlist = list(fileIterator)
    if nodbwrite:
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context)
        info("dataset scan complete, not writing to database")
        return dset
       
    elif operation==CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s"%datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, useVersion=useVersion, **context)
        
    elif operation in [UPDATE_OP, REPLACE_OP]:
        if operation==REPLACE_OP:
            versionObj = dset.getVersionObj(-1)
        else:
            versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, useVersion=useVersion, **context)
         
    elif operation==RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
         
    elif operation==DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s"%`operation`)

    # Create a new dataset version if necessary
    if useVersion == -1:
        if keepVersion:
            if existingVersion<=0:
                newVersion = getInitialDatasetVersion(versionByDate)
            else:
                newVersion = existingVersion
        elif newVersion is None:
            newVersion = getNextDatasetVersion(existingVersion, versionByDate)
    else:
        newVersion = useVersion
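    # Worked example: with existingVersion=3, keepVersion=False and no explicit
    # newVersion, getNextDatasetVersion presumably yields 4 (or a date-based
    # stamp such as 20190415 when version_by_date is set).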

    dset.reaggregate = False

    if newVersion<existingVersion:
        versionList = dset.getVersionList()
        if newVersion in versionList:
            addNewVersion = False

    # Add a new version
    if addNewVersion and newVersion > existingVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title

        # if project uses PIDs, generate PID for dataset
        dataset_pid = None
        if pid_connector:
            dataset_pid = pid_connector.make_handle_from_drsid_and_versionnumber(drs_id=datasetName, version_number=newVersion)
            info("Assigned PID to dataset %s.v%s: %s " % (datasetName, newVersion, dataset_pid))

        # if project uses citation, build citation url
        project_config_section = 'config:%s' %context.get('project')
        citation_url = handler.get_citation_url(project_config_section, config, datasetName, newVersion, test_publication)

        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes,
                                                  tech_notes_title=datasetTechNotesTitle, pid=dataset_pid, citation_url=citation_url)

        info("New dataset version = %d"%newDsetVersionObj.version)
        
        try:
            for var in dset.variables:
                session.delete(var)
        except IntegrityError as ie:
            debug("sqlalchemy IntegrityError: " + str(ie))
            raise ESGPublishError("Error in creating dataset version, did you already publish this version to the database?")
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    # Keep the current (latest) version
    elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d"%versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    elif masterGateway is not None:     # Force version set on replication
        info("Dataset version = %d"%newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
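
A minimal calling sketch for extractFromDataset, assuming the publisher machinery (config, project handler, CF handler, session factory) has been initialized elsewhere; the facet values and helper name below are illustrative:

import os

def publishOne(datasetName, filePaths, Session, handler, cfHandler):
    # dbSession must be a session factory (extractFromDataset calls it);
    # the iterator yields (path, size) tuples as the docstring requires
    fileIterator = ((path, os.stat(path).st_size) for path in filePaths)
    return extractFromDataset(datasetName, fileIterator, Session, handler, cfHandler,
                              operation=CREATE_OP, perVariable=True,
                              project='cmip6', model='some_model',
                              experiment='some_experiment')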
Example #12
def nodeIterator(top, nodefilt, filefilt, followSymLinks=True, allFiles=False):
    """Generate an iterator over non-empty directories that match a pattern.

    Returns an iterator that returns a tuple (*path*, *sample_file*, *groupdict*) at each iteration, where:

    - *path* is the node (directory) path
    - *sample_file* is a file in the node that matches the file filter
    - *groupdict* is the group dictionary generated by the match. For example, if *nodefilt* contains a named group '(?P<model>...)' that matches 'some_value', then *groupdict* maps 'model' => 'some_value'

    top
      String name of the top-level directory to scan.

    nodefilt
      A regular expression as defined in the Python re module. Each node returned matches the expression.
      May also be a list of regular expressions, in which case each node returned matches at least one expression
      in the list.

    filefilt
      A regular expression as defined in the Python re module. Each sample file returned has basename matching the filter.

    followSymLinks
      Boolean flag. Symbolic links are followed unless followSymLinks is False.

    allFiles = False
      Boolean flag. If True, iterate over all files that match the filter. Otherwise just return
      the first file that matches.

    """

    try:
        names = os.listdir(top)
    except os.error:
        return

    if not isinstance(nodefilt, list):
        nodefilt = [nodefilt]

    foundOne = False
    for basename in names:
        name = os.path.join(top, basename)
        try:
            if followSymLinks:
                st = os.stat(name)
            else:
                st = os.lstat(name)
        except os.error:
            continue

        # Search regular files in top directory
        if stat.S_ISREG(st.st_mode):
            if not foundOne or allFiles:

                # Find the first node filter that matches
                for filt in nodefilt:
                    result = re.match(filt, top)
                    debug("Comparing %s with filter %s ..." % (top, filt))
                    if result is not None:
                        debug("... match")
                        break
                    debug("... no match")

                # If the node pattern matched and the file filter matches:
                if (result is not None) and (re.match(filefilt, basename)
                                             is not None):
                    groupdict = result.groupdict()
                    foundOne = True
                    yield (top, basename, groupdict)

        # Search subdirectories
        elif stat.S_ISDIR(st.st_mode):
            for nodepath, filepath, gdict in nodeIterator(
                    name, nodefilt, filefilt, followSymLinks=followSymLinks,
                    allFiles=allFiles):
                yield (nodepath, filepath, gdict)

    return
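
nodeIterator itself assumes os, re, and stat are imported and that a debug logger is available. A short usage sketch (the directory layout and patterns below are illustrative):

# Walk /data, matching nodes whose path encodes a model name, and print the
# first NetCDF file found in each matching directory:
nodefilt = r'/data/(?P<model>[^/]+)'
filefilt = r'.*\.nc$'
for path, samplefile, groups in nodeIterator('/data', nodefilt, filefilt):
    print "node=%s sample=%s model=%s" % (path, samplefile, groups['model'])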
Example #13
def checkAndUpdateRepo(cmor_table_path, ds_version):
    """
        Checks for a file written to a predefined location.  if not present or too old, will pull the repo based on the input path argument and update the timestamp.
    """
    # This is run during handler initialization and not for each file validation

    # Pull repo if fetched more than one day ago
    # or if never fetched before
    if os.path.exists(UPDATE_TIMESTAMP):
        mtime = os.path.getmtime(UPDATE_TIMESTAMP)
        now = time()
        pull_cmor_repo = (now - mtime) > 86400.0  # one day in seconds
    else:
        pull_cmor_repo = True

    if pull_cmor_repo:
        try:
            # Fetch the CMOR table repo (the cd runs in a subshell, so the
            # caller's working directory is unchanged)
            checkedRun(('cd {} && git fetch --quiet').format(cmor_table_path))
            # Update local timestamp
            with open(UPDATE_TIMESTAMP, "w") as f:
                f.write("CMOR table updated at {}".format(time()))
            debug("Local CMOR table repository fetched or updated")
        except Exception as e:
            warning("Attempted to update the CMOR table repo but encountered an error: " + str(e))

    # Change the repo branch in any case
    try:
        # Stash any changes from a previous checkout, then check out the
        # appropriate CMOR table tag (the cd runs in a subshell, so the
        # caller's working directory is unchanged)
        checkedRun(
            ('cd {} && git stash --quiet && git checkout {} --quiet').format(
                cmor_table_path, ds_version))
        # Update local timestamp
        with open(UPDATE_TIMESTAMP, "w") as f:
            f.write("CMOR table updated at {}".format(time()))
        debug("Using CMOR table tag: {}".format(ds_version))
    except Exception as e:
        raise ESGPublishError(
            "data_specs_version tag %s not found in the CMOR tables (or another git error occurred). Please contact support"
            % ds_version)

    # Get the most up-to-date CMIP6_CV in any case
    if ds_version != "master":
        try:
            # PrePARE requires the most up-to-date CMIP6 CV, so refresh
            # CMIP6_CV.json from the master branch (again in a subshell)
            checkedRun(('cd {} && git checkout master CMIP6_CV.json --quiet'
                        ).format(cmor_table_path))
            debug("CMIP6 CV updated from master")
        except Exception as e:
            raise ESGPublishError(
                "Master branch does not exist, CMIP6_CV.json not found, or another git error occurred. Please contact support")
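
checkAndUpdateRepo leans on a checkedRun helper that this listing does not show. A minimal sketch of the assumed behavior (run a shell command, raise on a nonzero exit status); the body is illustrative, not the actual esgcet implementation:

import subprocess

def checkedRun(command):
    # shell=True so the compound 'cd ... && git ...' commands above work;
    # assumed to raise on any nonzero exit status
    status = subprocess.call(command, shell=True)
    if status != 0:
        raise ESGPublishError("Command failed with status %d: %s" % (status, command))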
Example #14
    def validateFile(self, fileobj):
        """
        For CMIP6, this will first verify whether the data was written by CMOR at (or above) the version set in the ini file.
        If so, the file is declared valid. If not, the file goes through the PrePARE (CV) check; PrePARE also runs CFChecker.

        Raises ESGPublishError if settings are missing or file fails the checks.
        Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler.
        """

        validator = PrePARE.PrePARE

        f = fileobj.path

        # TODO refactoring: these could be loaded upfront in the constructor
        config = getConfig()
        project_section = 'project:' + self.name
        project_config_section = 'config:' + self.name
        min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0")
        min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0")
        data_specs_version = config.get(project_config_section, "data_specs_version", default="master")
        cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH)
        force_validation = config.getboolean(project_config_section, "force_validation", default=False)
        cmor_table_subdirs = config.getboolean(project_config_section, "cmor_table_subdirs", default=False)

        if not force_validation:

            if self.replica:
                info("skipping PrePARE for replica (file %s)" % f)
                return

            try:
                file_cmor_version = fileobj.getAttribute('cmor_version', None)
            except:
                file_cmor_version = None
                debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

            passed_cmor = False
            if compareLibVersions(min_cmor_version, file_cmor_version):
                debug('File %s cmor-ized at version %s, passed!'%(f, file_cmor_version))
                passed_cmor = True

        try:
            table = fileobj.getAttribute('table_id', None)
        except:
            raise ESGPublishError("File %s missing required table_id global attribute" % f)

        try:
            variable_id = fileobj.getAttribute('variable_id', None)
        except:
            raise ESGPublishError("File %s missing required variable_id global attribute" % f)

        # data_specs_version drives CMOR table fetching
        # Behavior A (default): fetch the "master" branch (if "data_specs_version" is not set in esg.ini)
        # Behavior B: fetch the branch specified by "data_specs_version=my_branch" in esg.ini
        # Behavior C: fetch the branch given by the file's global attribute, using "data_specs_version=file" in esg.ini

        try:
            file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
        except Exception as e:
            raise ESGPublishError("File %s missing required data_specs_version global attribute"%f)

        if not compareLibVersions(min_ds_version, file_data_specs_version):
            raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s"%(f,file_data_specs_version,min_ds_version))
        # At this point the file has an acceptable data_specs_version.
        # If it was also CMORized at an acceptable version, we can exit.

        if (not force_validation) and passed_cmor:
            return
            
        if data_specs_version == "file":
            data_specs_version = file_data_specs_version

        table_dir = getTableDir(cmor_table_path, data_specs_version, cmor_table_subdirs)
        debug("Validating {} using tables dir: {}".format(f, table_dir))

        try:
            process = validator.checkCMIP6(table_dir)
            if process is None:
                raise ESGPublishError("File %s failed the CV check - object create failure"%f)
            process.ControlVocab(f)
        except:
            raise ESGPublishError("File %s failed the CV check"%f)