def deleteFilesVersion(dset, dsetVersion, pathlist, session, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, **context):

    info("Deleting file entries for dataset: %s, version %d" % (dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Create a file dictionary for the dataset
    fobjdict = {}                   # File version objects for the new dataset version
    for fobj in dsetVersion.getFileVersions():
        fobjdict[fobj.location] = fobj

    nfiles = len(pathlist)
    varlocate = configOptions['variable_locate']

    seq = 0
    addNewDatasetVersion = False
    for path, size in pathlist:

        # If the file exists in the dataset, delete the file children (with cascade), and the file
        if path in fobjdict:
            fileVersionObj = fobjdict[path]
            info("Deleting entry for file %s" % path)

            # If this is the latest dataset version, remove the file variables and reaggregate ...
            if haveLatestDsetVersion:
                fileVersionObj.parent.deleteChildren(session)
                addNewDatasetVersion = True

            # ... otherwise just delete the membership of the file version in the dataset version
            else:
                fileVersionObj.deleteChildren(session)
                session.commit()
            del fobjdict[path]
        else:
            info("File entry not found: %s, skipping" % path)

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return addNewDatasetVersion, fobjdict.values()
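# issueCallback is called throughout this module as issueCallback(progressCallback,
# i, n, lo, hi): i of n work items completed, mapped into the sub-range [lo, hi]
# of the overall progress. A minimal sketch of that scaling, assuming the
# (callback, initial, final) tuple described in the aggregateVariables docstring
# below; the real helper also honors stopEvent.stop_extract by raising, which is
# why callers wrap it in try / rollback / raise:
def _issueCallbackSketch(progressTuple, i, n, lo, hi):
    if progressTuple is None:
        return
    callback, initial, final = progressTuple
    fraction = lo + (hi - lo) * (float(i) / n)          # position within [lo, hi]
    callback(initial + fraction * (final - initial))    # rescale to the caller's range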
def renameFilesVersion(dset, dsetVersion, pathlist, session, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, keepVersion=False, newVersion=None, extraFields=None, **context):

    info("Renaming files in dataset: %s, version %d" % (dset.name, dsetVersion.version))

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    basedict = dset.getBaseDictionary()

    nfiles = len(pathlist)
    varlocate = configOptions['variable_locate']
    seq = 0
    for path, size in pathlist:

        # If the file exists, rename it
        oldpath = None
        if extraFields is not None:
            oldpath = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if oldpath is None:
            info("No from_file field for file %s, skipping" % path)
            continue

        if oldpath in locdict:
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent
            if not os.path.exists(path):
                info("File not found: %s, skipping" % path)
                continue
            info("Renaming %s to %s" % (oldpath, path))
            del basedict[fileObj.base]
            base = generateFileBase(path, basedict, dset.name)
            fileObj.base = base
            basedict[base] = 1
            fileVersionObj.location = path
            del locdict[oldpath]
            locdict[path] = fileVersionObj
        else:
            info("File entry %s not found, skipping" % oldpath)
            continue

        seq += 1

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return False
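# extraFieldsGet (used above) resolves a (dataset_name, path, field) triple
# against a dataset version, while createDataset/updateDatasetVersion read
# ``extraFields`` directly with (dataset_name, version, path, field) keys.
# A sketch of the assumed resolution, scanning versions downward from the
# given one (hypothetical helper, for illustration only):
def _extraFieldsGetSketch(extraFields, key, dsetVersion):
    dsetName, path, field = key
    for vers in range(dsetVersion.version, 0, -1):
        value = extraFields.get((dsetName, vers, path, field), None)
        if value is not None:
            return value
    return None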
# Excerpt from the per-dataset publication loop: one iteration publishes
# ``datasetName`` / ``versionno``, then polls the service for completion.
if parentId is None:
    parentIdent = handler.getParentId(datasetName)
elif type(parentId) == type({}):
    parentIdent = parentId[datasetName]
else:
    parentIdent = parentId
messaging.info("Publishing: %s" % datasetName)
dset, statusId, state, evname, status = publishDataset(datasetName, parentIdent, service, threddsRootURL, session, schema=schema, version=versionno)
messaging.info(" Result: %s" % status.getStateItem())
results.append((dset, statusId, state))
resultDict[(datasetName, versionno)] = evname

# Poll each dataset again
j += 1
if state not in (PublicationState.PROCESSING, PublicationState.SUCCESSFUL):
    issueCallback(progressCallback, j * spi, n, 0, 1)
    continue

for i in range(spi):
    if state == PublicationState.SUCCESSFUL:
        evname = PUBLISH_DATASET_EVENT
        event = Event(dset.name, dset.getVersion(), evname)
        dset.events.append(event)
        resultDict[(dset.name, versionno)] = evname
        issueCallback(progressCallback, j * spi, n, 0, 1)
        break
    elif state == PublicationState.PROCESSING:
        sleep(float(servicePollingDelay))
        status = PublicationStatus(statusId, service)
        messaging.info(" Result: %s" % status.getStateItem())
        state = status.getState()
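# The same poll-with-delay pattern, isolated for clarity (sketch; reuses the
# PublicationStatus(statusId, service).getState() calls above, with a
# hypothetical maxPolls bound standing in for the ``spi`` loop):
def _pollPublicationSketch(statusId, service, servicePollingDelay, maxPolls):
    status = PublicationStatus(statusId, service)
    state = status.getState()
    for _ in range(maxPolls):
        if state != PublicationState.PROCESSING:
            break
        sleep(float(servicePollingDelay))
        status = PublicationStatus(statusId, service)
        state = status.getState()
    return state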
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None):
    """
    Aggregate file variables into variables, and add to the database.
    Populates the database tables:

    - variable
    - file_variable
    - associated attribute tables

    The aggregated variables and attributes are committed to the database.

    datasetName
      String dataset identifier.

    dbSession
      A database Session.

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    cfHandler
      A CFHandler to validate standard names, etc.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``).
      If set to True (in another thread) the extraction is stopped.

    datasetInstance
      Existing dataset instance. If not provided, the instance is regenerated from the database.
    """

    session = dbSession()
    info("Aggregating variables")

    # Look up the dataset
    if datasetInstance is None:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        for variable in dset.variables:
            session.delete(variable)
        for attrname, attr in dset.attributes.items():
            if not attr.is_category:
                del dset.attributes[attrname]
        session.commit()
        dset.variables = []
    else:
        dset = datasetInstance
        # session.save_or_update(dset)
        session.add(dset)
    if dset is None:
        raise ESGPublishError("Dataset not found: %s" % datasetName)

    dsetindex = {}          # dsetindex[varname] = [(variable, domain), (variable, domain), ...]
                            #   where domain = ((dim0, len0, 0), (dim1, len1, 1), ...)
                            # Note:
                            #   (1) If dim0 is the aggregate dimension, len0 is 0
                            #   (2) A dsetindex entry has multiple tuples only if more than one
                            #       variable has the same name and different domains.
    varindex = {}           # varindex[(varname, domain, attrname)] = attribute
    globalAttrIndex = {}    # globalAttrIndex[attname] = attval, for global attributes
    dsetvars = []

    # Set of all target variables of the dataset
    dset_target_vars = set()

    # Create variables
    seq = 0
    nfiles = len(dset.getFiles())
    for file in dset.getFiles():
        for filevar in file.file_variables:
            if filevar.is_target_variable:
                dset_target_vars.add(filevar.short_name)

            # Get the filevar and variable domain
            fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions)
            fvdomain.sort(lambda x, y: cmp(x[SEQ], y[SEQ]))
            filevar.domain = fvdomain
            if len(fvdomain) > 0 and fvdomain[0][0] == aggregateDimensionName:
                vardomain = ((aggregateDimensionName, 0, 0),) + tuple(fvdomain[1:])  # Zero out aggregate dimension length
            else:
                vardomain = tuple(fvdomain)

            # Create the variable if necessary
            varlist = dsetindex.get(filevar.short_name, None)
            if varlist is None or vardomain not in [item[1] for item in varlist]:
                var = Variable(filevar.short_name, filevar.long_name)
                var.domain = vardomain

                # Record coordinate variable range if applicable
                if filevar.coord_type is not None:
                    var.coord_type = filevar.coord_type
                    if var.coord_type == 'Z':
                        var.coord_values = filevar.coord_values
                    var.coord_range = filevar.coord_range

                dsetvars.append(var)
                if varlist is None:
                    dsetindex[var.short_name] = [(var, vardomain)]
                else:
                    varlist.append((var, vardomain))
            else:
                for tvar, domain in varlist:
                    if domain == vardomain:
                        var = tvar
                        break

            # Attach the file variable to the variable
            var.file_variables.append(filevar)

            # Create attributes
            for fvattribute in filevar.attributes:
                vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None)
                if vattribute is None:
                    attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length)
                    var.attributes.append(attribute)
                    varindex[(var.short_name, vardomain, attribute.name)] = attribute
                    if attribute.name == 'units':
                        var.units = attribute.value

        # Create global attributes
        for fileattr in file.attributes:
            fattribute = globalAttrIndex.get(fileattr.name, None)
            if fattribute is None and fileattr.name not in ['readDimension']:
                attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length)
                dset.attributes[attribute.name] = attribute
                globalAttrIndex[attribute.name] = attribute

        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Find the aggregation dimension bounds variable, if any
    aggDim = lookupVar(aggregateDimensionName, dsetindex)
    boundsName = lookupAttr(aggDim, 'bounds')
    aggUnits = lookupAttr(aggDim, 'units')
    aggDimBounds = lookupVar(boundsName, dsetindex)

    # Set calendar for time aggregation
    isTime = cfHandler.axisIsTime(aggDim)
    if isTime:
        calendar = cfHandler.getCalendarTag(aggDim)
        if calendar is None:
            calendar = "gregorian"
    else:
        calendar = None
    dset.calendar = calendar
    dset.aggdim_name = aggregateDimensionName
    dset.aggdim_units = aggUnits
    cdcalendar = cfHandler.tagToCalendar(calendar)

    # Add the non-aggregate dimension variables to the dataset
    for var in dsetvars:
        if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars:
            dset.variables.append(var)

    # Set coordinate ranges
    for var in dset.variables:
        for name, length, seq in var.domain:
            if name == aggregateDimensionName:
                continue
            dvar = lookupCoord(name, dsetindex, length)
            if dvar is not None:
                units = lookupAttr(dvar, 'units')
                if units is None:
                    warning("Missing units, variable=%s" % dvar.short_name)
                    units = ''
                if hasattr(dvar, 'coord_type'):
                    if dvar.coord_type == 'X':
                        var.eastwest_range = dvar.coord_range + ':' + units
                    elif dvar.coord_type == 'Y':
                        var.northsouth_range = dvar.coord_range + ':' + units
                    elif dvar.coord_type == 'Z':
                        var.updown_range = dvar.coord_range + ':' + units
                        var.updown_values = dvar.coord_values

    # Attach aggregate dimension filevars to files
    if aggDim is not None:
        for filevar in aggDim.file_variables:
            filevar.file.aggDim = filevar
    if aggDimBounds is not None:
        for filevar in aggDimBounds.file_variables:
            filevar.file.aggDimBounds = filevar

    # Combine aggregate dimensions:
    # Scan all variables with the aggregate dimension in the domain. For each such variable,
    # create an aggregate dimension variable, and bounds if needed.
    timevars = []
    for var in dset.variables:
        if len(var.domain) > 0 and aggregateDimensionName == var.domain[0][NAME]:
            aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName)
            aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName)
            if aggVar is not None:
                aggVar.units = aggUnits
                timevars.append(aggVar)
            if aggBoundsVar is not None:
                timevars.append(aggBoundsVar)

    # Create variable dimensions, aggregating the agg dimension
    debug("Creating dimensions")
    i = 0
    nvars = len(dset.variables + timevars)
    for var in dset.variables + timevars:
        vardomain = var.domain

        # Increment aggregate dimension length
        if len(vardomain) > 0 and aggregateDimensionName == vardomain[0][NAME]:
            for filevar in var.file_variables:
                fvdomain = filevar.domain
                vardomain = ((aggregateDimensionName, vardomain[0][LENGTH] + fvdomain[0][LENGTH], vardomain[0][SEQ]),) + tuple(vardomain[1:])
        var.domain = vardomain

        # Create the variable domain
        for name, length, seq in vardomain:
            dimension = VariableDimension(name, length, seq)
            var.dimensions.append(dimension)
        i += 1
        try:
            issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Set variable aggregate dimension ranges
    debug("Setting aggregate dimension ranges")
    seq = 0
    nvars = len(dset.variables + timevars)
    for var in dset.variables + timevars:
        vardomain = var.domain
        if len(vardomain) > 0 and vardomain[0][NAME] == aggregateDimensionName:

            # Adjust times so they have consistent base units
            try:
                filevarRanges = [(x.file.getLocation(),
                                  cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar),
                                  cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar))
                                 for x in var.file_variables]
            except:
                for fv in var.file_variables:
                    try:
                        firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                        lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar)
                    except:
                        error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s" % (fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units))
                        raise

            mono = cmp(filevarRanges[0][1], filevarRanges[0][2])
            if mono <= 0:
                filevarRanges.sort(lambda x, y: cmp(x[1], y[1]))
            else:
                filevarRanges.sort(lambda x, y: -cmp(x[1], y[1]))

            # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated.
            lastValues = numpy.array(map(lambda x: x[2], filevarRanges))
            firstValues = numpy.array(map(lambda x: x[1], filevarRanges))
            if var not in [aggDim, aggDimBounds]:
                if mono <= 0:
                    compare = (lastValues[0:-1] >= firstValues[1:])
                else:
                    compare = (lastValues[0:-1] <= firstValues[1:])
                if compare.any():
                    overlaps = compare.nonzero()[0]
                    dset.warning("Variable %s is duplicated:" % (var.short_name), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True
                    nprint = min(len(overlaps), 3)
                    for i in range(nprint):
                        dset.warning("  %s: (%d, %d)" % filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE)
                        dset.warning("  %s: (%d, %d)" % filevarRanges[overlaps[i] + 1], WARNING_LEVEL, AGGREGATE_MODULE)
                    if len(overlaps) > nprint:
                        dset.warning("    ... (%d duplications total)" % len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE)

            # Check monotonicity of last values.
            else:
                if mono <= 0:
                    compare = (lastValues[0:-1] < lastValues[1:]).all()
                else:
                    compare = (lastValues[0:-1] > lastValues[1:]).all()
                if not compare:
                    dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s" % (var.short_name, repr(filevarRanges)), WARNING_LEVEL, AGGREGATE_MODULE)
                    var.has_errors = True

            var.aggdim_first = float(firstValues[0])
            var.aggdim_last = float(lastValues[-1])
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # Combine identical aggregate dimensions and add to the dataset
    timevardict = {}
    for var in timevars:
        timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var
    for var in timevardict.values():
        dset.variables.append(var)

    # Validate standard names
    seq = 0
    nvars = len(dset.variables)
    for var in dset.variables:
        attr = lookupAttr(var, 'standard_name')
        if attr is not None:
            if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)):
                info("Invalid standard name: %s for variable %s" % (attr, var.short_name))
            else:
                var.standard_name = attr
        seq += 1
        try:
            issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    debug("Adding variable info to database")
    session.commit()
    session.close()
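# Worked example of the overlap test above, with illustrative values: three
# files already sorted ascending by first time value (the mono <= 0 case).
import numpy
_ranges = [('f1.nc', 0.0, 10.0), ('f2.nc', 8.0, 20.0), ('f3.nc', 20.0, 30.0)]
_last = numpy.array([r[2] for r in _ranges])
_first = numpy.array([r[1] for r in _ranges])
_compare = (_last[0:-1] >= _first[1:])
# _compare == [True, True]: f1's last value (10) overlaps f2's first (8), and
# f2's last (20) touches f3's first (20), so both adjacent pairs are flagged;
# _compare.nonzero()[0] -> array([0, 1]), the indices reported as duplications.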
def updateDatasetVersion(dset, dsetVersion, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, replace=False, forceRescan=False, useVersion=-1, **context):

    if replace:
        info("Replacing files in dataset: %s, version %d" % (dset.name, dsetVersion.version))
    else:
        info("Updating files in dataset: %s, version %d" % (dset.name, dsetVersion.version))

    haveLatestDsetVersion = (dsetVersion.version == dset.getVersion())

    # Get the list of FileVersion objects for this version
    locdict = {}
    todelete = {}
    for fobj in dsetVersion.getFileVersions():
        loc = fobj.location
        locdict[loc] = todelete[loc] = fobj

    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    exclude_variables = configOptions['exclude_variables']
    perVariable = configOptions['perVariable']

    # Get the base dictionary for the entire dataset
    basedict = dset.getBaseDictionary()

    # For each item in the pathlist:
    seq = 0
    fileModified = False            # Any file has been modified (added, replaced, or deleted)
    newFileVersionObjs = []
    nfiles = len(pathlist)
    for path, sizet in pathlist:

        # Rescan this file if it has been added or replaced
        rescanFile = haveLatestDsetVersion

        size, mtime = sizet
        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            if useVersion != -1:
                csum = extraFields.get((dset.name, useVersion, path, 'checksum'), None)
                csumtype = extraFields.get((dset.name, useVersion, path, 'checksum_type'), None)
            else:
                csum = extraFieldsGet(extraFields, (dset.name, path, 'checksum'), dsetVersion)
                csumtype = extraFieldsGet(extraFields, (dset.name, path, 'checksum_type'), dsetVersion)
            techNotes = extraFields.get((dset.name, useVersion, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, useVersion, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Check if 'from_file' was specified for this file
        fromfile = None
        if extraFields is not None:
            fromfile = extraFieldsGet(extraFields, (dset.name, path, 'from_file'), dsetVersion)
        if fromfile is None:
            oldpath = path
        else:
            frombase = os.path.basename(fromfile)
            tobase = os.path.basename(path)
            if frombase != tobase:
                info("Basenames are different for files: %s and %s. Ignoring 'from_file' option." % (path, fromfile))
                oldpath = path
            else:
                oldpath = fromfile

        # If the item is in the current dataset version, get the file version obj and add to the list
        if oldpath in locdict:
            del todelete[oldpath]
            fileVersionObj = locdict[oldpath]
            fileObj = fileVersionObj.parent

            # If the file matches the existing file version, no-op, ...
            if os.path.exists(oldpath) and compareFiles(fileVersionObj, handler, path, size, offline, checksum=csum):
                if not forceRescan:
                    info("File %s exists, skipping" % path)
                newFileVersionObjs.append(fileVersionObj)
                rescanFile = False

            # ... else create a new version of the file
            else:
                if oldpath != path:
                    info("Replacing file %s" % oldpath)
                newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
                newFileVersionObjs.append(newFileVersionObj)
                fileObj.deleteChildren(session)
                fileModified = True

        # Else create a new file / file version object and add to the list ...
        else:
            fileObj = FileFactory(dset, path, basedict, session)
            newFileVersionObj = FileVersionFactory(fileObj, path, session, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
            newFileVersionObjs.append(newFileVersionObj)
            fileModified = True

        # ... and rescan if necessary
        if rescanFile or forceRescan:
            if not offline:
                info("Scanning %s" % path)
                f = handler.openPath(path)
                extractFromFile(dset, f, fileObj, session, handler, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, exclude_variables=exclude_variables, perVariable=perVariable, **context)
                f.close()
            else:
                info("File %s is offline" % path)

        # Callback progress
        seq += 1
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    # If updating, add the file version objects ...
    if not replace:
        for fileVersionObj in todelete.values():
            newFileVersionObjs.append(fileVersionObj)

    # ... else if rescanning delete the file object children
    elif haveLatestDsetVersion:
        for fileVersionObj in todelete.values():
            fileObj = fileVersionObj.parent
            fileObj.deleteChildren(session)
            fileModified = True

    # Create a new dataset version if:
    # - a file has been added, replaced, or deleted, and
    # - the current version is the latest
    createNewDatasetVersion = haveLatestDsetVersion and fileModified

    return createNewDatasetVersion, newFileVersionObjs
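# compareFiles (called above) is assumed to answer "does the file at ``path``
# still match this FileVersion?". A minimal sketch of that contract, assuming
# FileVersion keeps the size and checksum passed to the factories above
# (the attribute names here are assumptions, for illustration only):
def _compareFilesSketch(fileVersionObj, path, size, csum):
    if fileVersionObj.size != size:                      # size mismatch: definitely changed
        return False
    if csum is not None and fileVersionObj.checksum is not None:
        return fileVersionObj.checksum == csum           # checksum is authoritative when present
    return True                                          # otherwise the size match has to suffice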
def createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=None, offline=False, progressCallback=None, stopEvent=None, extraFields=None, masterGateway=None, useVersion=-1, **context):

    fobjlist = []                   # File objects in the dataset
    nfiles = len(pathlist)

    basedict = {}                   # file.base => 1
    varlocate = configOptions['variable_locate']
    checksumClient = configOptions['checksumClient']
    checksumType = configOptions['checksumType']
    exclude_variables = configOptions['exclude_variables']
    perVariable = configOptions['perVariable']

    seq = 0
    for path, sizet in pathlist:
        size, mtime = sizet

        csum = None
        csumtype = checksumType
        techNotes = None
        techNotesTitle = None
        datasetTechNotes = None
        datasetTechNotesTitle = None
        if extraFields is not None:
            csum = extraFields.get((dset.name, useVersion, path, 'checksum'), None)
            csumtype = extraFields.get((dset.name, useVersion, path, 'checksum_type'), None)
            techNotes = extraFields.get((dset.name, useVersion, path, 'tech_notes'), None)
            techNotesTitle = extraFields.get((dset.name, useVersion, path, 'tech_notes_title'), None)
            datasetTechNotes = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes'), None)
            datasetTechNotesTitle = extraFields.get((dset.name, useVersion, path, 'dataset_tech_notes_title'), None)
        if csum is None and not offline and checksumClient is not None:
            csum = checksum(path, checksumClient)
            csumtype = checksumType

        # Cache the dataset tech notes info for later use
        if datasetTechNotes is not None:
            dset.dataset_tech_notes = datasetTechNotes
            dset.dataset_tech_notes_title = datasetTechNotesTitle

        # Create a file and version
        base = generateFileBase(path, basedict, dset.name)
        file = File(base, 'netCDF')
        basedict[base] = 1
        fileVersion = FileVersion(1, path, size, mod_time=mtime, checksum=csum, checksum_type=csumtype, tech_notes=techNotes, tech_notes_title=techNotesTitle)
        file.versions.append(fileVersion)
        fobjlist.append(fileVersion)
        seq += 1

        dset.files.append(file)

        # Extract the dataset contents
        if not offline:
            info("Scanning %s" % path)
            f = handler.openPath(path)
            extractFromFile(dset, f, file, session, handler, cfHandler, aggdimName=aggregateDimensionName, varlocate=varlocate, exclude_variables=exclude_variables, perVariable=perVariable, **context)
            f.close()
        else:
            info("File %s is offline" % path)

        # Callback progress
        try:
            issueCallback(progressCallback, seq, nfiles, 0, 1, stopEvent=stopEvent)
        except:
            session.rollback()
            session.close()
            raise

    return True, fobjlist
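# Note the two pathlist shapes in this module: createDataset and
# updateDatasetVersion unpack each entry as (path, (size, mtime)), while
# deleteFilesVersion and renameFilesVersion use only (path, size). Building
# the scan-style list (illustrative paths):
import os
_paths = ['/data/sample/tas_2000.nc', '/data/sample/tas_2001.nc']
_pathlist = [(p, (os.stat(p).st_size, os.stat(p).st_mtime)) for p in _paths]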