Code example #1
File: extract.py  Project: ncarenton/esg-publisher
def deleteFilesVersion(
    dset,
    dsetVersion,
    pathlist,
    session,
    cfHandler,
    configOptions,
    aggregateDimensionName=None,
    offline=False,
    progressCallback=None,
    stopEvent=None,
    extraFields=None,
    **context
):

    info("Deleting file entries for dataset: %s, version %d" % (dset.name, dsetVersion.version))

    haveLatestDsetVersion = dsetVersion.version == dset.getVersion()

    # Create a file dictionary for the dataset
    fobjdict = {}  # file version objects for the new dataset version
    for fobj in dsetVersion.getFileVersions():
        fobjdict[fobj.location] = fobj

    nfiles = len(pathlist)

    varlocate = configOptions["variable_locate"]
    seq = 0
    addNewDatasetVersion = False
    for path, size in pathlist:

        # If the file exists in the dataset, delete the file children (with cascade), and the file
        if path in fobjdict:
            fileVersionObj = fobjdict[path]
            info("Deleting entry for file %s" % path)

            # If this is the latest dataset version, remove the file variables and reaggregate ...
            if haveLatestDsetVersion:
                fileVersionObj.parent.deleteChildren(session)
                addNewDatasetVersion = True

            # ... otherwise just delete the membership of the file version in the dataset version
            else:
                fileVersionObj.deleteChildren(session)
                session.commit()
            del fobjdict[path]
        else:
            info("File entry not found: %s, skipping" % path)

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return addNewDatasetVersion, list(fobjdict.values())
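The two return values tell the caller whether the latest dataset version was modified and which file versions survive. A minimal caller-side sketch; buildDatasetVersion and the version-bump convention are hypothetical, not part of the project code above:

# Hypothetical caller: deleteFilesVersion reports whether the latest dataset
# version was touched and returns the surviving file version objects.
addNewVersion, remainingFileVersions = deleteFilesVersion(
    dset, dsetVersion, pathlist, session, cfHandler, configOptions)
if addNewVersion:
    # Assumed convention: mint the next dataset version from the survivors.
    buildDatasetVersion(dset, dset.getVersion() + 1, remainingFileVersions, session)
session.commit()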
Code example #2
File: extract.py  Project: gavinmbell/esg-publisher
def renameFilesVersion(dset, dsetVersion, pathlist, session, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, keepVersion=False, newVersion=None, extraFields=None, **context):

    info("Renaming files in dataset: %s, version %d"%(dset.name, dsetVersion.version))

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    basedict = dset.getBaseDictionary()

    nfiles = len(pathlist)

    varlocate = configOptions['variable_locate']
    seq = 0
    for path, size in pathlist:

        # If the file exists, rename it
        oldpath = None
        if extraFields is not None:
            oldpath = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if oldpath is None:
            info("No from_file field for file %s, skipping"%path)
            continue

        if oldpath in locdict:
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            if not os.path.exists(path):
                info("File not found: %s, skipping"%path)
                continue
            info("Renaming %s to %s"%(oldpath, path))
            del basedict[fileObj.base]
            base = generateFileBase(path, basedict, dset.name)
            fileObj.base = base
            basedict[base] = 1
            fileVersionObj.location = path
            del locdict[oldpath]
            locdict[path] = fileVersionObj
        else:
            info("File entry %s not found, skipping"%oldpath)
            continue

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return False
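renameFilesVersion depends on the project-internal helper extraFieldsGet to recover the 'from_file' entry. Its implementation is not shown in this listing; the following is a minimal sketch of plausible semantics, inferred from the (dataset, version, path, field) keys used by the other examples below, and purely an assumption:

def extraFieldsGet(extraFields, key, dsetVersion):
    # Sketch only: probe the requested field at this dataset version, then
    # fall back through earlier versions. The real helper may differ.
    dsetName, path, field = key
    for version in range(dsetVersion.version, 0, -1):
        value = extraFields.get((dsetName, version, path, field))
        if value is not None:
            return value
    return None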
Code example #3
def deleteFilesVersion(dset, dsetVersion, pathlist, session, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, **context):

    info("Deleting file entries for dataset: %s, version %d"%(dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Create a file dictionary for the dataset
    fobjdict = {}                       # file version objects for the new dataset version
    for fobj in dsetVersion.getFileVersions():
        fobjdict[fobj.location] = fobj

    nfiles = len(pathlist)

    varlocate = configOptions['variable_locate']
    seq = 0
    addNewDatasetVersion = False
    for path, size in pathlist:

        # If the file exists in the dataset, delete the file children (with cascade), and the file
        if path in fobjdict:
            fileVersionObj = fobjdict[path]
            info("Deleting entry for file %s"%path)

            # If this is the latest dataset version, remove the file variables and reaggregate ...
            if haveLatestDsetVersion:
                fileVersionObj.parent.deleteChildren(session)
                addNewDatasetVersion = True

            # ... otherwise just delete the membership of the file version in the dataset version
            else:
                fileVersionObj.deleteChildren(session)
                session.commit()
            del fobjdict[path]
        else:
            info("File entry not found: %s, skipping"%path)

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return addNewDatasetVersion, list(fobjdict.values())
Code example #4
            if parentId is None:
                parentIdent = handler.getParentId(datasetName)
            elif isinstance(parentId, dict):
                parentIdent = parentId[datasetName]
            else:
                parentIdent = parentId
            messaging.info("Publishing: %s"%datasetName)
            dset, statusId, state, evname, status = publishDataset(datasetName, parentIdent, service, threddsRootURL, session, schema=schema, version=versionno)
            messaging.info("  Result: %s"%status.getStateItem())
            results.append((dset, statusId, state))
            resultDict[(datasetName,versionno)] = evname

            # Poll each dataset again
            j += 1
            if state not in (PublicationState.PROCESSING, PublicationState.SUCCESSFUL):
                issueCallback(progressCallback, j*spi, n, 0, 1)
                continue

            for i in range(spi):
                if state==PublicationState.SUCCESSFUL:
                    evname = PUBLISH_DATASET_EVENT
                    event = Event(dset.name, dset.getVersion(), evname)
                    dset.events.append(event)
                    resultDict[(dset.name,versionno)] = evname
                    issueCallback(progressCallback, j*spi, n, 0, 1)
                    break
                elif state==PublicationState.PROCESSING:
                    sleep(float(servicePollingDelay))
                    status = PublicationStatus(statusId, service)
                    messaging.info("  Result: %s"%status.getStateItem())
                    state = status.getState()
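The inner for-loop is a bounded poll-with-delay: while the publication is still PROCESSING, sleep for servicePollingDelay seconds and re-query the status, giving up after spi polls. The same pattern factored into a generic helper (the helper name and shape are mine, not the project's):

import time

def pollUntilDone(getState, processingState, maxPolls, delaySeconds):
    # Re-query state up to maxPolls times while it still reports
    # "processing"; return the final state observed.
    state = getState()
    for _ in range(maxPolls):
        if state != processingState:
            break
        time.sleep(delaySeconds)
        state = getState()
    return state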
Code example #5
File: extract.py  Project: gavinmbell/esg-publisher
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If the attribute is set to True (typically from another thread), the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        if dset is None:
            raise ESGPublishError("Dataset not found: %s"%datasetName)
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in list(dset.attributes.items()):
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry only has multiple tuples when
                                        #         more than one variable shares the same name but
                                        #         has a different domain.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:

            # Get the filevar and variable domain
            fvdomain = [(x.name, x.length, x.seq) for x in filevar.dimensions]
            fvdomain.sort(key=lambda dim: dim[SEQ])
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds]:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                # If the bulk conversion fails, identify the offending file,
                # then re-raise so filevarRanges is never referenced unbound.
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise
                raise

            # Sort ranges on the first value, matching the data's direction
            ascending = (filevarRanges[0][1] <= filevarRanges[0][2])
            filevarRanges.sort(key=lambda x: x[1], reverse=not ascending)

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array([x[2] for x in filevarRanges])
            firstValues = numpy.array([x[1] for x in filevarRanges])
            if (var not in [aggDim, aggDimBounds]):
                if ascending:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if ascending:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
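The progressCallback tuple documented above is threaded through issueCallback, whose source is not part of this listing. Judging from call sites such as issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent), it appears to map the current/total fraction into a sub-range of the caller's [initial, final] window and to honor the stop flag. A sketch under those assumptions, not the project's actual utility:

def issueCallback(progressTuple, current, total, subLo, subHi, stopEvent=None):
    # Sketch: check the stop flag, then scale current/total first into
    # [subLo, subHi] and then into the caller's [initial, final] window.
    if stopEvent is not None and getattr(stopEvent, 'stop_extract', False):
        raise RuntimeError("Extraction stopped")   # assumed stop signal
    if progressTuple is None:
        return
    callback, initial, final = progressTuple
    fraction = subLo + (subHi - subLo) * (float(current) / total)
    callback(initial + (final - initial) * fraction)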
Code example #6
File: extract.py  Project: gavinmbell/esg-publisher
def updateDatasetVersion(dset, dsetVersion, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, replace=False, forceRescan=False, **context):

    if replace:
        info("Replacing files in dataset: %s, version %d"%(dset.name, dsetVersion.version))
    else:
        info("Updating files in dataset: %s, version %d"%(dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']

    # Get the base dictionary for the entire dataset
    basedict = dset.getBaseDictionary()

    # For each item in the pathlist:
    seq = 0
    fileModified = False                # Any file has been modified (added, replaced, or deleted)
    newFileVersionObjs = []
    nfiles = len(pathlist)
    for path, sizet in pathlist:

        # Rescan this file if it has been added, or replaced
        rescanFile = haveLatestDsetVersion

        size, mtime = sizet
        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            csum = extraFieldsGet(extraFields, (dset.name, path, 'checksum'), dsetVersion)
            csumtype = extraFieldsGet(extraFields, (dset.name, path, 'checksum_type'), dsetVersion)
            techNotes = extraFields.get((dset.name, -1, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, -1, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, -1, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, -1, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Check if 'from_file' was specified for this file
        fromfile = None
        if extraFields is not None:
            fromfile = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if fromfile is None:
            oldpath = path
        else:
            frombase = os.path.basename(fromfile)
            tobase = os.path.basename(path)
            if frombase!=tobase:
                info("Basenames are different for files: %s and %s. Ignoring 'from_file' option."%(path, fromfile))
                oldpath = path
            else:
                oldpath = fromfile

        # If the item is in the current dataset version, get the file version obj and add to the list
        if oldpath in locdict:
            del todelete[oldpath]
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            
            # If the file matches the existing file version, no-op, ...
            if os.path.exists(oldpath) and compareFiles(fileVersionObj, handler, path, size, offline, checksum=csum):
                if not forceRescan:
                    info("File %s exists, skipping"%path)
                newFileVersionObjs.append(fileVersionObj)
                rescanFile = False

            # ... else create a new version of the file
            else:
                if oldpath!=path:
                    info("Replacing file %s"%oldpath)
                newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
                newFileVersionObjs.append(newFileVersionObj)
                fileObj.deleteChildren(session)
                fileModified = True

        # Else create a new file / file version object and add to the list ...
        else:
            fileObj = FileFactory(dset, path, basedict, session)
            newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
            newFileVersionObjs.append(newFileVersionObj)
            fileModified = True

        # ... and rescan if necessary
        if rescanFile or forceRescan:
            if not offline:
                info("Scanning %s"%path)
                f = handler.openPath(path)
                extractFromFile(dset, f, fileObj, session, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, **context)
                f.close()
            else:
                info("File %s is offline"%path)

        # Callback progress
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # If updating, add the file version objects ...
    if not replace:
        for fileVersionObj in todelete.values():
            newFileVersionObjs.append(fileVersionObj)

    # ... else if rescanning delete the file object children
    elif haveLatestDsetVersion:
        for fileVersionObj in todelete.values():
            fileObj = fileVersionObj.parent
            fileObj.deleteChildren(session)
            fileModified = True

    # Create a new dataset version if:
    # - a file has been added, replaced, or deleted, and
    # - the current version is the latest
    createNewDatasetVersion = haveLatestDsetVersion and fileModified
    
    return createNewDatasetVersion, newFileVersionObjs
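updateDatasetVersion chooses between a no-op and a new file version with compareFiles, which is not shown here. A guess at the comparison, assuming FileVersion records expose the size and checksum passed to FileVersionFactory (the attribute names are assumptions, and the real routine presumably also consults handler and the offline flag):

def compareFiles(fileVersionObj, handler, path, size, offline, checksum=None):
    # Sketch only: a file "matches" its recorded version when the sizes agree
    # and, if both sides have one, the checksums agree as well.
    if fileVersionObj.size != size:
        return False
    recorded = getattr(fileVersionObj, 'checksum', None)
    if checksum is not None and recorded is not None:
        return recorded == checksum
    return True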
Code example #7
File: extract.py  Project: gavinmbell/esg-publisher
def createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, masterGateway=None, **context):

    fobjlist = []                       # File objects in the dataset
    nfiles = len(pathlist)

    basedict = {}                       # file.base => 1
    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    seq = 0
    for path, sizet in pathlist:
        size, mtime = sizet

        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            csum = extraFields.get((dset.name, -1, path, 'checksum'), None)
            csumtype = extraFields.get((dset.name, -1, path, 'checksum_type'), None)
            techNotes = extraFields.get((dset.name, -1, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, -1, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, -1, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, -1, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Create a file and version
        base = generateFileBase(path, basedict, dset.name)
        file = File(base, 'netCDF')
        basedict[base] = 1
        fileVersion = FileVersion(1, path, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
        file.versions.append(fileVersion)
        fobjlist.append(fileVersion)
        seq += 1

        dset.files.append(file)

        # Extract the dataset contents
        if not offline:
            info("Scanning %s"%path)
            f = handler.openPath(path)
            extractFromFile(dset, f, file, session, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, **context)
            f.close()
        else:
            info("File %s is offline"%path)

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return True, fobjlist
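createDataset unpacks each pathlist entry as (path, (size, mtime)). For files on local disk such a list can be built with os.stat; a small sketch (the helper name is mine):

import os

def buildPathlist(paths):
    # Build the (path, (size, mtime)) tuples that createDataset and
    # updateDatasetVersion expect.
    result = []
    for path in paths:
        st = os.stat(path)
        result.append((path, (st.st_size, st.st_mtime)))
    return result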
Code example #8
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database. Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If the attribute is set to True (typically from another thread), the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.

    """

    session = dbSession()
    info("Aggregating variables")

    # Lookup the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        if dset is None:
            raise ESGPublishError("Dataset not found: %s"%datasetName)
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in list(dset.attributes.items()):
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)

    dsetindex = {}                      # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                                        #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                                        #   Note:
                                        #     (1) If dim0 is the aggregate dimension, len0 is 0
                                        #     (2) A dsetindex entry only has multiple tuples when
                                        #         more than one variable shares the same name but
                                        #         has a different domain.
    varindex = {}                       # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}                # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # list of all target variables of a dataset
    dset_target_vars = set()

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:
            if filevar.is_target_variable:
                dset_target_vars.add(filevar.short_name)

            # Get the filevar and variable domain
            fvdomain = [(x.name, x.length, x.seq) for x in filevar.dimensions]
            fvdomain.sort(key=lambda dim: dim[SEQ])
            filevar.domain = fvdomain
            if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type=='Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range
                    
                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain==vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name==aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s"%dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type=='X':
                        var.eastwest_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Y':
                        var.northsouth_range = dvar.coord_range+':'+units
                    elif dvar.coord_type=='Z':
                        var.updown_range = dvar.coord_range+':'+units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables+timevars)
    for var in dset.variables+timevars:
        vardomain = var.domain
        if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables]
            except:
                # If the bulk conversion fails, identify the offending file,
                # then re-raise so filevarRanges is never referenced unbound.
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise
                raise

            # Sort ranges on the first value, matching the data's direction
            ascending = (filevarRanges[0][1] <= filevarRanges[0][2])
            filevarRanges.sort(key=lambda x: x[1], reverse=not ascending)

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array([x[2] for x in filevarRanges])
            firstValues = numpy.array([x[1] for x in filevarRanges])
            if (var not in [aggDim, aggDimBounds]):
                if ascending:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps)>nprint:
                        dset.warning("    ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

                # Check monotonicity of last values.
                else:
                    if ascending:
                        compare = (lastValues[0:-1] < lastValues[1:]).all()
                    else:
                        compare = (lastValues[0:-1] > lastValues[1:]).all()
                    if not compare:
                        dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE)
                        var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var

    for var in timevardict.values():
        dset.variables.append(var)
        
    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if (attr is not None):
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s"%(attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
Code example #9
def updateDatasetVersion(dset, dsetVersion, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, replace=False, forceRescan=False, useVersion=-1, **context):

    if replace:
        info("Replacing files in dataset: %s, version %d"%(dset.name, dsetVersion.version))
    else:
        info("Updating files in dataset: %s, version %d"%(dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    exclude_variables = configOptions['exclude_variables']
    perVariable = configOptions['perVariable']

    # Get the base dictionary for the entire dataset
    basedict = dset.getBaseDictionary()

    # For each item in the pathlist:
    seq = 0
    fileModified = False                # Any file has been modified (added, replaced, or deleted)
    newFileVersionObjs = []
    nfiles = len(pathlist)
    for path, sizet in pathlist:

        # Rescan this file if it has been added, or replaced
        rescanFile = haveLatestDsetVersion

        size, mtime = sizet
        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            if useVersion != -1:
                csum = extraFields.get((dset.name, useVersion, path, 'checksum'), None)
                csumtype = extraFields.get((dset.name, useVersion, path, 'checksum_type'), None)
            else:
                csum = extraFieldsGet(extraFields, (dset.name, path, 'checksum'), dsetVersion)
                csumtype = extraFieldsGet(extraFields, (dset.name, path, 'checksum_type'), dsetVersion)
            techNotes = extraFields.get((dset.name, useVersion, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, useVersion, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Check if 'from_file' was specified for this file
        fromfile = None
        if extraFields is not None:
            fromfile = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if fromfile is None:
            oldpath = path
        else:
            frombase = os.path.basename(fromfile)
            tobase = os.path.basename(path)
            if frombase!=tobase:
                info("Basenames are different for files: %s and %s. Ignoring 'from_file' option."%(path, fromfile))
                oldpath = path
            else:
                oldpath = fromfile

        # If the item is in the current dataset version, get the file version obj and add to the list
        if oldpath in locdict:
            del todelete[oldpath]
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            
            # If the file matches the existing file version, no-op, ...
            if os.path.exists(oldpath) and compareFiles(fileVersionObj, handler, path, size, offline, checksum=csum):
                if not forceRescan:
                    info("File %s exists, skipping"%path)
                newFileVersionObjs.append(fileVersionObj)
                rescanFile = False

            # ... else create a new version of the file
            else:
                if oldpath!=path:
                    info("Replacing file %s"%oldpath)
                newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
                newFileVersionObjs.append(newFileVersionObj)
                fileObj.deleteChildren(session)
                fileModified = True

        # Else create a new file / file version object and add to the list ...
        else:
            fileObj = FileFactory(dset, path, basedict, session)
            newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
            newFileVersionObjs.append(newFileVersionObj)
            fileModified = True

        # ... and rescan if necessary
        if rescanFile or forceRescan:
            if not offline:
                info("Scanning %s"%path)
                f = handler.openPath(path)
                extractFromFile(dset, f, fileObj, session, handler, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, exclude_variables=exclude_variables, perVariable=perVariable, **context)
                f.close()
            else:
                info("File %s is offline"%path)

        # Callback progress
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # If updating, add the file version objects ...
    if not replace:
        for fileVersionObj in todelete.values():
            newFileVersionObjs.append(fileVersionObj)

    # ... else if rescanning delete the file object children
    elif haveLatestDsetVersion:
        for fileVersionObj in todelete.values():
            fileObj = fileVersionObj.parent
            fileObj.deleteChildren(session)
            fileModified = True

    # Create a new dataset version if:
    # - a file has been added, replaced, or deleted, and
    # - the current version is the latest
    createNewDatasetVersion = haveLatestDsetVersion and fileModified

    return createNewDatasetVersion, newFileVersionObjs
Code example #10
def createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, masterGateway=None, useVersion=-1, **context):

    fobjlist = []                       # File objects in the dataset
    nfiles = len(pathlist)

    basedict = {}                       # file.base => 1
    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    exclude_variables = configOptions['exclude_variables']
    perVariable = configOptions['perVariable']

    seq = 0
    for path, sizet in pathlist:
        size, mtime = sizet

        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            csum = extraFields.get((dset.name, useVersion, path, 'checksum'), None)
            csumtype = extraFields.get((dset.name, useVersion, path, 'checksum_type'), None)
            techNotes = extraFields.get((dset.name, useVersion, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, useVersion, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Create a file and version
        base = generateFileBase(path, basedict, dset.name)
        file = File(base, 'netCDF')
        basedict[base] = 1
        fileVersion = FileVersion(1, path, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
        file.versions.append(fileVersion)
        fobjlist.append(fileVersion)
        seq += 1

        dset.files.append(file)

        # Extract the dataset contents
        if not offline:
            info("Scanning %s"%path)
            f = handler.openPath(path)
            extractFromFile(dset, f, file, session, handler, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, exclude_variables=exclude_variables, perVariable=perVariable, **context)
            f.close()
        else:
            info("File %s is offline"%path)

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return True, fobjlist
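The useVersion variants above read pre-computed checksums out of extraFields under (dataset, version, path, field) keys, which lets them skip the external checksum client. A sketch of how such a dictionary might be assembled; the helper and the MD5 choice are assumptions (the project takes checksumType from its configuration):

import hashlib

def buildExtraFields(datasetName, version, pathlist):
    # Pre-compute checksums so createDataset/updateDatasetVersion find them
    # in extraFields instead of invoking checksumClient.
    fields = {}
    for path, _sizet in pathlist:
        with open(path, 'rb') as f:
            digest = hashlib.md5(f.read()).hexdigest()
        fields[(datasetName, version, path, 'checksum')] = digest
        fields[(datasetName, version, path, 'checksum_type')] = 'MD5'
    return fields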