def checkAndUpdateRepo(cmor_table_path, handler, ds_version):
    """
    Check for a timestamp file at a predefined location (UPDATE_TIMESTAMP). If it is
    not present, or is more than a day old, pull the CMOR table repo at the given
    path and refresh the timestamp.
    """
    pull_cmor_repo = False
    if os.path.exists(UPDATE_TIMESTAMP):
        mtime = os.path.getmtime(UPDATE_TIMESTAMP)
        now = time()
        if now - mtime > 86400.0:  # one day in seconds
            pull_cmor_repo = True
    else:
        pull_cmor_repo = True

    if pull_cmor_repo:
        try:
            # note: os.system does not raise on a non-zero git exit status
            os.system("pushd " + cmor_table_path + " ; git pull ; popd")
            f = open(UPDATE_TIMESTAMP, "w")
            f.write("t")
            f.close()
        except Exception as e:
            warning("Attempted to update the CMOR table repo and encountered an error: " + str(e))

    if handler.data_specs_version != ds_version:
        try:
            os.system("pushd " + cmor_table_path + " ; git checkout " + ds_version + " ; popd")
            handler.set_spec_version(ds_version)
        except Exception as e:
            raise ESGPublishError("data_specs_version tag %s not found in the CMOR tables, or checkout failed. Please contact support" % ds_version)
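# Hedged sketch of the staleness test used above, with a hypothetical marker path;
# UPDATE_TIMESTAMP in this module plays the same role. Illustrative only.
import os
from time import time

def tables_need_pull(marker_path, max_age_seconds=86400.0):
    """Return True when the marker file is missing or older than max_age_seconds."""
    if not os.path.exists(marker_path):
        return True
    return (time() - os.path.getmtime(marker_path)) > max_age_seconds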
def get_citation_url(self, project_config_section, config, dataset_name, dataset_version, test_publication):
    """
    Return the citation_url if the project uses citations, otherwise return None.

    project_config_section
        The name of the project section in the ini file
    config
        The configuration (ini files)
    dataset_name
        Name of the dataset
    dataset_version
        Version of the dataset
    """
    if config.has_option(project_config_section, 'citation_url'):
        try:
            pattern = self.getFilters(option='dataset_id')
            attributes = re.match(pattern[0], dataset_name).groupdict()
            if 'version' not in attributes:
                attributes['version'] = str(dataset_version)
            if 'dataset_id' not in attributes:
                attributes['dataset_id'] = dataset_name
            return config.get(project_config_section, 'citation_url', 0, attributes)
        except:
            warning('Unable to generate a citation url for %s' % dataset_name)
            return None
    else:
        return None
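# Hedged illustration of the substitution performed above: a hypothetical dataset_id
# pattern with named groups is matched against the dataset name, and the resulting
# dict (plus 'version' and 'dataset_id') fills a citation_url template in the same
# ConfigParser '%(...)s' style. Pattern, dataset name, and URL are made-up examples.
import re

_example_pattern = r'(?P<project>\w+)\.(?P<model>[^.]+)\.(?P<experiment>[^.]+)'
_example_template = 'https://example.org/citation/%(dataset_id)s.v%(version)s.json'

_attrs = re.match(_example_pattern, 'cmip5.EXAMPLE-MODEL.historical').groupdict()
_attrs.setdefault('version', '1')
_attrs.setdefault('dataset_id', 'cmip5.EXAMPLE-MODEL.historical')
_example_citation_url = _example_template % _attrs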
def getHandler(path, Session, validate=True, **extra_args):
    """
    Get a project handler from a file. The project is determined by trying to create
    each registered handler using the file.

    path
        String path of the file to read project info from.
    Session
        SQLAlchemy Session
    validate
        If True, create a validating handler which will raise an error if an invalid
        field value is read or input.

    Any other keyword arguments are passed to the handler.
    """
    found = False
    items = projectRegistry.items()
    items.sort(lambda x, y: cmp(projectRegistry.order(x[0]), projectRegistry.order(y[0])))
    for name, cls in items:
        try:
            handler = instantiateHandler(cls, name, path, Session, validate, **extra_args)
        except ESGInvalidMetadataFormat:
            continue
        found = True
        break
    if not found:
        warning('No project handler found for file %s' % path)
        handler = None
    return handler
def getHandler(path, Session, validate=True):
    """
    Get a project handler from a file. The project is determined by trying to create
    each registered handler using the file.

    path
        String path of the file to read project info from.
    Session
        SQLAlchemy Session
    validate
        If True, create a validating handler which will raise an error if an invalid
        field value is read or input.
    """
    found = False
    items = projectRegistry.items()
    items.sort(lambda x, y: cmp(projectRegistry.order(x[0]), projectRegistry.order(y[0])))
    for name, cls in items:
        try:
            handler = cls(name, path, Session, validate)
        except ESGInvalidMetadataFormat:
            continue
        found = True
        break
    if not found:
        warning('No project handler found for file %s' % path)
        handler = None
    return handler
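# Hedged usage sketch: pick a handler for the first file of a scan, falling back to
# an explicit project name when detection fails. firstFile and projectName are
# placeholders; Session is an SQLAlchemy session factory as in the functions above.
def handler_for_scan(firstFile, projectName, Session):
    handler = getHandler(firstFile, Session, validate=True)
    if handler is None and projectName is not None:
        handler = getHandlerByName(projectName, firstFile, Session)
    if handler is None:
        raise ESGPublishError("No project handler found for %s" % firstFile)
    return handler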
def register(self, projectName, moduleName, className):
    try:
        __import__(moduleName)
    except:
        warning('Cannot import handler %s:%s for project %s' % (moduleName, className, projectName))
        return  # nothing to register if the import failed
    m = sys.modules[moduleName]
    cls = m.__dict__.get(className)
    if cls is None:
        warning('No such class in %s: %s' % (moduleName, className))
        return  # do not register a missing handler class
    self.registry[projectName] = cls
def updateDatasetFromContext(context, datasetName, Session):
    """
    Update a persistent dataset with values from context (name/value dictionary).
    The context may have fields such as event fields, not associated with the project handler.

    context
        A property (name/value) dictionary.
    datasetName
        String dataset identifier.
    Session
        Database session factory.
    """
    dset = Dataset.lookup(datasetName, Session)
    if dset is None:
        raise ESGQueryError("Dataset not found: %s" % datasetName)
    projectName = dset.get_project(Session)
    handler = getHandlerByName(projectName, None, Session)
    basicHeaders, eventHeaders, categories, derivedHeaders = getQueryFields(handler, return_list=False)
    properties = context.copy()

    # Set basic and event properties
    session = Session()
    session.add(dset)
    for key, value in properties.items():
        if key in basicHeaders:
            if key != 'id':
                if key == 'name':
                    if len(handler.parseDatasetName(value, {})) == 0:
                        warning("Dataset name: %s does not match dataset_id pattern in config file." % value)
                setattr(dset, key, value)
            else:
                warning("Cannot update id field")
            del properties[key]
        elif key in eventHeaders:
            event = dset.events[-1]
            setEvent(event, key, value)
            del properties[key]

    # Set attribute headers
    handler.setContext(properties)
    handler.saveContext(datasetName, Session)

    session.commit()
    session.close()
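# Hedged usage sketch: update a couple of searchable properties on an existing
# dataset. The dataset name and field values are placeholders; only fields returned
# by getQueryFields() for the project handler are applied above.
def set_dataset_properties_example(Session):
    context = {'experiment': 'historical', 'model': 'EXAMPLE-MODEL'}
    updateDatasetFromContext(context, 'example.project.dataset', Session)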
def loadEntryPoints(self):
    """
    Get the entry points for the entry point group associated with this registry,
    and build an entry point dictionary.
    """
    optionDict = {}
    distPlugins = {}
    # distPlugins: entry_point_distribution => distribution_dict
    #   where distribution_dict: entry_point_name => handler_class
    for ep in iter_entry_points(self.entryPointGroup):
        if distPlugins.has_key(ep.dist):
            distPlugins[ep.dist][ep.name] = ep
        else:
            distPlugins[ep.dist] = {ep.name: ep}

    for dist, v in distPlugins.items():
        if v.has_key(HANDLER_NAME_ENTRY_POINT):
            if v.has_key(HANDLER_ENTRY_POINT):
                handlerName = v[HANDLER_NAME_ENTRY_POINT].module_name
                if optionDict.has_key(handlerName):
                    handlerValue = v[HANDLER_ENTRY_POINT]
                    handlerClassName, prevDist, mustload = optionDict[handlerName]
                    if handlerValue != handlerClassName:
                        error("Conflicting handler names found:\n In distribution %s, %s => (%s);\n In distribution %s, %s => (%s)\n To remove the error uninstall one of the packages with 'easy_install -mxN package_name'." % (dist, handlerName, handlerValue, prevDist, handlerName, handlerClassName))
                else:
                    optionDict[handlerName] = (v[HANDLER_ENTRY_POINT], dist, True)
            else:
                warning("Distribution %s does not define a %s option." % (dist, HANDLER_ENTRY_POINT))
        elif v.has_key(HANDLER_DICT_ENTRY_POINT):
            handlerDict = v[HANDLER_DICT_ENTRY_POINT].load()
            for handlerName, handlerClassName in handlerDict.items():
                if optionDict.has_key(handlerName):
                    handlerValue = v[HANDLER_ENTRY_POINT]
                    handlerClassName, prevDist, mustload = optionDict[handlerName]
                    if handlerValue != handlerClassName:
                        error("Conflicting handler names found:\n In distribution %s, %s => (%s);\n In distribution %s, %s => (%s)\n To remove the error uninstall one of the packages with 'easy_install -mxN package_name'." % (dist, handlerName, handlerValue, prevDist, handlerName, handlerClassName))
                else:
                    optionDict[handlerName] = (handlerClassName, dist, False)
    return optionDict
def validateFile(self, fileobj):
    """Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler."""
    if not fileobj.hasAttribute('project_id'):
        result = False
        message = "No global attribute: project_id"
    else:
        project_id = fileobj.getAttribute('project_id', None)
        result = (project_id[:5] == "CMIP5")
        message = "project_id should be 'CMIP5'"
    if not result:
        if WARN:
            warning(message)
        else:
            raise ESGInvalidMetadataFormat(message)
def getHessianServiceURL():
    """Get the configured value of hessian_service_url"""
    config = getConfig()
    serviceURL = config.get('DEFAULT', 'hessian_service_url')
    gatewayServiceRoot = os.environ.get('ESG_GATEWAY_SVC_ROOT', None)
    if gatewayServiceRoot is not None:
        dum, serviceHost, dum, dum, dum, dum = urlparse.urlparse(serviceURL)
        dum, envServiceHost, dum, dum, dum, dum = urlparse.urlparse('http://' + gatewayServiceRoot)
        if serviceHost != envServiceHost:
            warning("hessian_service_url=%s but environment variable ESG_GATEWAY_SVC_ROOT=%s, please reconcile these values" % (serviceURL, gatewayServiceRoot))
    return serviceURL
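# Self-contained illustration of the host comparison above (Python 2 urlparse);
# both URLs are hypothetical.
import urlparse

_serviceHost = urlparse.urlparse('https://esgf-node.example.org/esg-search/ws')[1]
_envServiceHost = urlparse.urlparse('http://' + 'other-node.example.org')[1]
_hosts_match = (_serviceHost == _envServiceHost)  # False here, which would trigger the warning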
def readContext(self, fileInstance, **kw):
    """Get a dictionary of attribute/value pairs from an open file.

    Returns a dictionary of attribute/value pairs, which are added to the handler context.

    fileInstance
        Format handler instance representing the opened file, an instance of
        FormatHandler or a subclass.
    kw
        Optional keyword arguments.
    """
    result = {}
    f = fileInstance.file
    result = IPCC5Handler.readContext(self, fileInstance, **kw)
    if 'project' not in result:
        result['project'] = self.name

    # Parse CMOR table.
    if hasattr(f, 'table_id'):
        tableId = getattr(f, 'table_id')
        fields = tableId.split()
        # Assume table ID has the form 'Table table_id ...'
        if len(fields) > 1:
            table = fields[1]
            result['cmor_table'] = table
        else:
            result['cmor_table'] = 'noTable'

    # Read categories as file attributes, for values not already set
    for category in self.getFieldNames():
        if category not in result and hasattr(f, category):
            result[category] = getattr(f, category)
        # Check if mandatory fields are set
        if self.isMandatory(category) and category not in result:
            warning("Mandatory category %s not set for file %s, use -p option?" % (category, fileInstance.path))

    # Check validity
    self.validateContext(result)

    # Return the attribute/value dictionary
    return result
def check_pid_avail(self, project_config_section, config, version=None):
    """Returns the pid_prefix

    project_config_section
        The name of the project config section in esg.ini
    config
        The configuration (ini files)
    version
        Integer or Dict with dataset versions
    """
    # disable PIDs for local index without versioning (IPSL use case)
    if isinstance(version, int) and not version_pattern.match(str(version)):
        warning('Version %s, skipping PID generation.' % version)
        return None
    return '21.14100'
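# Hedged sketch of the version test above. version_pattern is a module-level regex
# whose definition is not shown here; a plausible form (an assumption, not the actual
# definition) is a YYYYMMDD-style version string.
import re
_version_pattern_example = re.compile(r'^\d{8}$')  # e.g. 20190115

_skip_pid = isinstance(1, int) and not _version_pattern_example.match(str(1))
# _skip_pid is True: a bare integer version such as 1 disables PID generation,
# while a date-style version would let check_pid_avail return the '21.14100' prefix.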
def getDerived(dset, dsetVersion, derivedHeaders, handler):
    result = []
    for attname in derivedHeaders:
        if attname == 'version':
            value = str(dsetVersion.version)
        elif attname == 'parent':
            dsetname = dset.name
            try:
                value = handler.getParentId(dsetname)
            except:
                warning("Cannot determine parent id for dataset %s" % dsetname)
                value = ''
        elif attname == 'version_name':
            value = dsetVersion.name
        elif attname == 'comment':
            value = dsetVersion.comment
        result.append(value)
    return result
def readContext(self, cdfile, model=''):
    "Get a dictionary of keys from an open file"
    result = BasicHandler.readContext(self, cdfile)
    f = cdfile.file

    for key, value in cmorAttributes.items():
        try:
            result[key] = getattr(f, value)
            if key in cmorArrayAttributes and type(result[key]) is numpy.ndarray:
                res = str(result[key][0])
                if key == 'run_name':
                    if res[0:3] != 'run':
                        res = 'run' + res
                result[key] = res
        except:
            pass

    if 'realization' in result and 'initialization_method' in result and 'physics_version' in result:
        ensemble = 'r%si%sp%s' % (result['realization'], result['initialization_method'], result['physics_version'])
        result['ensemble'] = ensemble
        result['run_name'] = ensemble

    base = os.path.basename(cdfile.path)
    try:
        index = base.index('_')
        varname = base[0:index]
        result['variable'] = varname
    except:
        warning("File path must have the form varname_XXX: %s" % cdfile.path)

    if not result.has_key('product'):
        result['product'] = 'output'

    self.mapEnumeratedValues(result)

    # If realm has multiple fields, pick the first one
    if 'realm' in result:
        realm = result['realm'].strip()
        if realm.find(' ') != -1:
            realms = realm.split(' ')
            result['realm'] = realms[0]

    # Parse CMOR table.
    if 'table_id' in result:
        tableId = result['table_id']
        fields = tableId.split()
        # Assume table ID has the form 'Table table_id ...'
        if len(fields) > 1 and (fields[1] in cmorTables):
            table = fields[1]
            result['cmor_table'] = table
        else:
            result['cmor_table'] = 'noTable'
    else:
        result['cmor_table'] = 'noTable'

    # Parse the product if it is unresolved
    if result['product'] == 'output':
        cmor_table = result['cmor_table']
        variable = result.get('variable', None)
        experiment = result.get('experiment', None)
        dateRange = self.getDateRangeFromPath()
        year1 = dateRange[0][0]
        year2 = dateRange[1][0]
        if year2 is None:
            year2 = year1
        result['product'] = getProduct(cmor_table, variable, experiment, year1, year2)

    validateDRSFieldValues(result, cdfile)

    return result
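# Self-contained example of the ensemble/run_name construction above; the member
# numbers are arbitrary.
_realization, _initialization_method, _physics_version = 1, 1, 2
_ensemble = 'r%si%sp%s' % (_realization, _initialization_method, _physics_version)
# _ensemble == 'r1i1p2'; readContext stores it under both 'ensemble' and 'run_name'.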
def extractFromFile(dataset, openfile, fileobj, session, cfHandler, aggdimName=None, varlocate=None, **context):
    """
    Extract metadata from a file, add to a database.

    dataset
        The dataset instance.
    openfile
        An open netCDF file object.
    fileobj
        A (logical) file instance.
    session
        A database session instance.
    cfHandler
        A CF handler instance
    aggdimName
        The name of the dimension which is split across files, if any.
    varlocate
        List with elements [varname, pattern]. The variable will be extracted from the file
        only if the filename matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']]
    context
        A dictionary with keys project, model, experiment, and run.
    """
    fileVersion = fileobj.versions[-1]

    # Get the aggregate dimension range
    if aggdimName is not None and openfile.hasVariable(aggdimName):
        aggvarFirst = openfile.getVariable(aggdimName, index=0)
        aggvarLast = openfile.getVariable(aggdimName, index=-1)
        aggvarLen = openfile.inquireVariableShape(aggdimName)[0]
        aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName))
        if aggdimName.lower() == "time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName) == "T"):
            if abs(aggvarFirst) > 1.e12 or abs(aggvarLast) > 1.e12:
                dataset.warning("File: %s has time range: [%f, %f], looks bogus." % (fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE)

    if aggdimName is not None and not openfile.hasVariable(aggdimName):
        info("Aggregate dimension not found: %s" % aggdimName)

    varlocatedict = {}
    if varlocate is not None:
        for varname, pattern in varlocate:
            varlocatedict[varname] = pattern

    # For each variable in the file:
    for varname in openfile.inquireVariableList():
        varshape = openfile.inquireVariableShape(varname)
        debug("%s%s" % (varname, `varshape`))

        # Check varlocate
        if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname], os.path.basename(fileVersion.location)):
            debug("Skipping variable %s in %s" % (varname, fileVersion.location))
            continue

        # Create a file variable
        filevar = FileVariable(varname, openfile.getAttribute('long_name', varname, None))
        fileobj.file_variables.append(filevar)

        # Create attributes:
        for attname in openfile.inquireAttributeList(varname):
            attvalue = openfile.getAttribute(attname, varname)
            atttype, attlen = getTypeAndLen(attvalue)
            attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen)
            filevar.attributes.append(attribute)
            debug('  %s.%s = %s' % (varname, attname, `attvalue`))

        # Create dimensions
        seq = 0
        dimensionList = openfile.inquireVariableDimensions(varname)
        for dimname, dimlen in zip(dimensionList, varshape):
            dimension = FileVariableDimension(dimname, dimlen, seq)
            filevar.dimensions.append(dimension)
            if dimname == aggdimName:
                filevar.aggdim_first = float(aggvarFirst)
                filevar.aggdim_last = float(aggvarLast)
                filevar.aggdim_units = aggvarunits
            seq += 1

        # Set coordinate axis range and type if applicable
        if len(varshape) == 1:
            var0 = openfile.getVariable(varname, index=0)
            varn = openfile.getVariable(varname, index=-1)
            if cfHandler.axisIsLatitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s" % (filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Y'
            elif cfHandler.axisIsLongitude(filevar):
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s" % (filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'X'
            elif cfHandler.axisIsLevel(filevar):
                vararray = openfile.getVariable(varname)
                filevar.coord_range = genCoordinateRange(var0, varn)
                if not isValidCoordinateRange(var0, varn):
                    warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s" % (filevar.coord_range, openfile.path, varname))
                filevar.coord_type = 'Z'
                filevar.coord_values = str(vararray)[1:-1]  # See set_printoptions call above

    # Create global attribute
    for attname in openfile.inquireAttributeList():
        attvalue = openfile.getAttribute(attname, None)
        atttype, attlen = getTypeAndLen(attvalue)
        attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen)
        fileobj.attributes.append(attribute)
        if attname == 'tracking_id':
            fileVersion.tracking_id = attvalue
        debug('.%s = %s' % (attname, attvalue))
def start_harvest(self, parent): from esgcet.publish import publishDatasetList from esgcet.model import Dataset, PUBLISH_FAILED_EVENT, ERROR_LEVEL dcolor1 = Pmw.Color.changebrightness(self.parent.parent, 'aliceblue', 0.8) # Make sure the publisher is logged in # if not self.parent.parent.password_flg: # self.parent.parent.menu.login_menu.evt_login( self.parent.parent ) # Start the busy routine to indicate to the users something is happening self.parent.parent.busyCursor = 'watch' self.parent.parent.busyWidgets = [ self.parent.parent.pane2.pane('EditPaneTop'), self.parent.parent.pane2.pane('EditPaneBottom'), self.parent.parent.pane2.pane('EditPaneStatus'), self.parent.parent.pane.pane('ControlPane') ] pub_busy.busyStart(self.parent.parent) try: # Generate the list of datasets to be published datasetNames = [] GUI_line = {} tab_name = self.parent.parent.top_notebook.getcurselection() selected_page = self.parent.parent.main_frame.selected_top_page if (selected_page is None): warning( "Must generate a list of datasets to scan before publishing can occur." ) pub_busy.busyEnd(self.parent.parent) return for x in self.parent.parent.main_frame.top_page_id[selected_page]: if self.parent.parent.main_frame.top_page_id[selected_page][x].cget( 'bg' ) != 'salmon' and self.parent.parent.main_frame.top_page_id2[ selected_page][x].cget('bg') != 'salmon': dset_name = self.parent.parent.main_frame.top_page_id2[ selected_page][x].cget('text') ####################################### # ganz added this 1/18/11 versionNum = self.parent.parent.main_frame.version_label[ selected_page][x].cget('text') dsetTuple = (dset_name, versionNum) #dsetName = generateDatasetVersionId(dsetTuple) ##################################################################################### # dsetTuple = parseDatasetVersionId(dset_name) # ganz no longer necessary datasetNames.append(dsetTuple) GUI_line[dset_name] = x else: if self.parent.parent.main_frame.top_page_id2[ selected_page][x].cget('bg') == 'salmon': self.parent.parent.main_frame.top_page_id[ selected_page][x].configure(relief='raised', background='salmon', image=self.off) # Publish collection of datasets testProgress = (self.parent.parent.statusbar.show, 0, 100) publishThredds = (quality_control_widgets.get_CheckBox3() == 1) publishGateway = (quality_control_widgets.get_CheckBox2() == 1) if (publishThredds): print 'publishing to Thredds' if (publishGateway): print 'publishing to Gateway' status_dict = publishDatasetList(datasetNames, self.Session, publish=publishGateway, thredds=publishThredds, progressCallback=testProgress) # Show the published status for x in status_dict.keys(): status = status_dict[x] dsetName, versionNo = x dsetVersionName = generateDatasetVersionId(x) guiLine = GUI_line[dsetName] # dsetVersionName] self.parent.parent.main_frame.status_label[selected_page][ guiLine].configure( text=pub_controls.return_status_text(status)) dset = Dataset.lookup(dsetName, self.Session) if dset.has_warnings(self.Session): warningLevel = dset.get_max_warning_level(self.Session) if warningLevel >= ERROR_LEVEL: buttonColor = "pink" buttonText = "Error" else: buttonColor = "yellow" buttonText = "Warning" self.parent.parent.main_frame.ok_err[selected_page][ guiLine].configure( text=buttonText, bg=buttonColor, relief='raised', command=pub_controls.Command( self.parent.parent.pub_buttonexpansion. 
extraction_widgets.error_extraction_button, dset)) else: self.parent.parent.main_frame.ok_err[selected_page][ guiLine].configure( text='Ok', bg=dcolor1, highlightcolor=dcolor1, relief='sunken', ) except: pub_busy.busyEnd( self.parent.parent ) # catch here in order to turn off the busy cursor ganz raise finally: pub_busy.busyEnd(self.parent.parent) self.my_refresh()
def main(argv): try: args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline', 'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite']) except getopt.error: print sys.exc_value return aggregateDimension = "time" datasetMapfile = None datasetName = None echoSql = False filefilt = '.*\.nc$' init_file = None initcontext = {} keepVersion = False las = False log_filename = None masterGateway = None message = None offline = False parent = None perVariable = None projectName = None properties = {} publish = False publishOnly = False publishOp = CREATE_OP readFiles = False rescan = False rescanDatasetName = [] restApi = None schema = None service = None summarizeErrors = False testProgress1 = testProgress2 = None thredds = False threddsReinit = None version = None versionList = None nodbwrite = False for flag, arg in args: if flag=='-a': aggregateDimension = arg elif flag=='--append': publishOp = UPDATE_OP elif flag in ['-c', '--create']: publishOp = CREATE_OP elif flag=='--dataset': datasetName = arg elif flag in ['-d', '--delete-files']: publishOp = DELETE_OP elif flag=='--echo-sql': echoSql = True elif flag=='--experiment': initcontext['experiment'] = arg elif flag=='--filter': filefilt = arg elif flag in ['-h', '--help']: print usage sys.exit(0) elif flag=='-i': init_file = arg elif flag=='--keep-version': keepVersion = True elif flag=='--log': log_filename = arg elif flag=='--map': datasetMapfile = arg elif flag in ['-m', '--message']: message = arg elif flag=='--model': initcontext['model'] = arg elif flag=='--nodbwrite': nodbwrite = True elif flag=='--new-version': try: version = string.atoi(arg) if version <=0: raise ValueError except ValueError: raise ESGPublishError("Version number must be a positive integer: %s"%arg) elif flag=='--no-thredds-reinit': threddsReinit = False elif flag=='--noscan': publishOnly = True elif flag=='--offline': offline = True elif flag=='--parent': parent = arg elif flag=='--per-time': perVariable = False elif flag=='--per-variable': perVariable = True elif flag=='--project': projectName = arg elif flag in ['-p', '--property']: name, value = arg.split('=') properties[name] = value elif flag=='--publish': publish = True elif flag in ['-e', '--read-directories']: readFiles = False elif flag=='--read-files': readFiles = True elif flag=='--rename-files': publishOp = RENAME_OP elif flag in ['-r', '--replace']: publishOp = REPLACE_OP elif flag=='--replica': masterGateway = arg warning("The --replica option is deprecated. 
Use --set-replica instead") elif flag=='--rest-api': restApi = True elif flag=='--service': service = arg elif flag=='--set-replica': masterGateway = 'DEFAULT' elif flag=='--summarize-errors': summarizeErrors = True elif flag=='--thredds': thredds = True elif flag=='--thredds-reinit': threddsReinit = True elif flag in ['-u', '--update']: publishOp = UPDATE_OP elif flag=='--use-existing': rescan = True rescanDatasetName.append(arg) elif flag=='--use-list': rescan = True if arg=='-': namelist=sys.stdin else: namelist = open(arg) for line in namelist.readlines(): line = line.strip() if line[0]!='#': rescanDatasetName.append(line) elif flag=='--validate': schema = arg restApi = True elif flag=='--version-list': versionList = arg # If offline, the project must be specified if offline and (projectName is None): raise ESGPublishError("Must specify project with --project for offline datasets") if version is not None and versionList is not None: raise ESGPublishError("Cannot specify both --new-version and --version-list") if versionList is not None: version = {} f = open(versionList) lines = f.readlines() f.close() for line in lines: line = line.strip() dsid, vers = line.split('|') dsid = dsid.strip() vers = int(vers.strip()) version[dsid] = vers # Load the configuration and set up a database connection config = loadConfig(init_file) engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600) initLogging('extract', override_sa=engine, log_filename=log_filename) Session = sessionmaker(bind=engine, autoflush=True, autocommit=False) # Register project handlers registerHandlers() # Get the default publication interface (REST or Hessian) if restApi is None: restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False) # If the dataset map is input, just read it ... dmap = None directoryMap = None extraFields = None if datasetMapfile is not None: dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True) datasetNames = dmap.keys() elif rescan: # Note: No need to get the extra fields, such as mod_time, since # they are already in the database, and will be used for file comparison if necessary. dmap, offline = queryDatasetMap(rescanDatasetName, Session) datasetNames = dmap.keys() # ... otherwise generate the directory map. else: # Online dataset(s) if not offline: if len(lastargs)==0: print "No directories specified." return if projectName is not None: handler = getHandlerByName(projectName, None, Session) else: multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt) firstFile, size = multiIter.next() listIter = list(multiIter) handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError("No project found in file %s, specify with --project."%firstFile) projectName = handler.name props = properties.copy() props.update(initcontext) if not readFiles: directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName) else: directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName) datasetNames = [(item,-1) for item in directoryMap.keys()] # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...] 
else: handler = getHandlerByName(projectName, None, Session, offline=True) dmap = {} listerSection = getOfflineLister(config, "project:%s"%projectName, service) offlineLister = config.get(listerSection, 'offline_lister_executable') commandArgs = "--config-section %s "%listerSection commandArgs += " ".join(lastargs) for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True): size, mtime = sizet if dmap.has_key((dsetName,-1)): dmap[(dsetName,-1)].append((filepath, str(size))) else: dmap[(dsetName,-1)] = [(filepath, str(size))] datasetNames = dmap.keys() datasetNames.sort() if len(datasetNames)==0: warning("No datasets found.") min_version = -1 else: min_version = sorted(datasetNames, key=lambda x: x[1])[0][1] # Must specify version for replications if min_version == -1 and masterGateway is not None and version is None and versionList is None: raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets") # Iterate over datasets if not publishOnly: # pdb.set_trace() datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite) if (not nodbwrite): result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema) # print `result` if summarizeErrors: print 'Summary of errors:' for name,versionno in datasetNames: dset = Dataset.lookup(name, Session) print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session) if dset.has_warnings(Session): print '=== Dataset: %s ==='%dset.name for line in dset.get_warnings(Session): print line
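# Hedged sketch of the --version-list file format parsed in main() above: one
# 'dataset_id | version' pair per line. The dataset ids are hypothetical.
_example_version_list = """\
cmip5.output1.EXAMPLE.historical.mon.atmos.Amon.r1i1p1 | 2
cmip5.output1.EXAMPLE.rcp45.mon.atmos.Amon.r1i1p1 | 1
"""
_version = {}
for _line in _example_version_list.splitlines():
    _dsid, _vers = _line.split('|')
    _version[_dsid.strip()] = int(_vers.strip())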
def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False):
    """Generate a directory map. Recursively scan each directory in *directoryList*,
    locating each directory with at least one file matching filefilt.

    Returns a directory map (dictionary) mapping
    dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]
    where the dataset_id is generated by matching the 'directory_format' configuration
    option to each directory path. The map has one entry per directory, where it is
    assumed that all files in the directory belong to the same dataset.

    directoryList
        List of directories to scan. The scan searches for directories matching the
        'directory_format' configuration file option for this project, and having at
        least one file matching *filefilt*.
    filefilt
        Regular expression as defined by the Python **re** module. Matched against
        the file basename.
    initContext
        Dictionary of field => value items. Entries override values found from
        matching the directory paths.
    datasetName
        Name of the dataset. If not specified, generate with ``generateDatasetId()``.
    """
    from esgcet.publish import nodeIterator

    # If the dataset name is specified, no need to get directory format filters
    if datasetName is None:
        # Get the dataset_id and filters
        filters = self.getFilters()
        config = getConfig()
        section = 'project:' + self.name
        dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
        idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
    else:
        filters = [r'.*$']

    # Iterate over nodes
    mapdict = self.getMaps()
    datasetMap = {}
    for direc in directoryList:
        if direc[-1] == '/':
            direc = direc[:-1]
        nodeiter = nodeIterator(direc, filters, filefilt)
        for nodepath, filepath, groupdict in nodeiter:
            if initContext is not None:
                groupdict.update(initContext)
            if not groupdict.has_key('project'):
                groupdict['project'] = self.name
            if datasetName is None:
                try:
                    datasetId = self.generateDatasetId('dataset_id', idfields, groupdict, multiformat=dataset_id_formats)
                    if use_version and 'version' in groupdict:
                        drsversion = groupdict['version']
                        if not re.match('^[0-9]+$', drsversion[0]):  # e.g. vYYYYMMDD
                            drsversion = drsversion[1:]
                        datasetId += '#%s' % drsversion
                except:
                    # set union, not '+', which is undefined for sets
                    allfields = reduce(lambda x, y: set(x) | set(y), idfields)
                    missingFields = list((set(allfields) - set(groupdict.keys())) - set(config.options(section)))
                    raise ESGPublishError("Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s" % (`missingFields`, nodepath))
            else:
                warning("Empty dataset name. Check that directory hierarchy format matches the configured format string in esg.ini")
                datasetId = datasetName
            if datasetMap.has_key(datasetId):
                datasetMap[datasetId].append((nodepath, filepath))
            else:
                datasetMap[datasetId] = [(nodepath, filepath)]

    if len(datasetMap) == 0:
        warning("Empty datasetMap. Check that directory hierarchy format matches the configured format string in esg.ini")

    return datasetMap
def new_query_page(self, parent, tab_name=None, query_id=None): # Start the busy routine to indicate to the users something is happening self.parent.parent.busyCursor = "watch" self.parent.parent.busyWidgets = [ self.parent.parent.pane2.pane("EditPaneTop"), self.parent.parent.pane2.pane("EditPaneBottom"), self.parent.parent.pane2.pane("EditPaneStatus"), self.parent.parent.pane.pane("ControlPane"), ] pub_busy.busyStart(self.parent.parent) try: properties = {} projectName = self.parent.query_fields["project"].get() # Must have projectName handler = getHandlerByName(projectName, None, self.Session) tabcolor = Pmw.Color.changebrightness(self.parent.parent, pub_controls.query_tab_color, 0.6) # works up to here if query_id is None: for x in self.parent.query_fields.keys(): query_string = self.parent.query_fields[x].get().lstrip() if (query_string == "-Any-") or (len(query_string) == 0): properties[x] = (2, "%") elif query_string != "-Any-": properties[x] = (1, query_string) if properties["id"] == (2, "%"): del properties["id"] # This causes an error because you cannot modify the 'id' listProperties = False result, headers = queryDatasets(projectName, handler, self.Session, properties) # works up to here # running this causes it to fail! self.new_page( parent, tabName=None, tab_color=tabcolor, page_type="query", query_result=result, list_fields=headers, ) else: result, headers = queryDatasets(projectName, handler, self.Session, properties) for x in result: query_id_found = False if query_id == x[0][:-1]: self.new_page( parent, tabName=None, tab_color=tabcolor, page_type="query", query_result=[x], list_fields=headers, ) query_id_found = True break if query_id_found is False: warning("The specified dataset id '%s' was not found.", query_id) # fails here # Enable the "Data Publication" button self.parent.ControlButton3.configure(state="normal") datasetNames = [] for x in result: datasetNames.append(x[1]) dmap, offline_map, extraFields = queryDatasetMap(datasetNames, self.Session, extra_fields=True) # Check if offline or not, then set the iteration values for each page selected_page = self.parent.parent.main_frame.selected_top_page self.parent.parent.hold_offline[selected_page] = offline_map self.parent.parent.main_frame.projectName[selected_page] = projectName self.parent.parent.main_frame.dmap[selected_page] = dmap self.parent.parent.main_frame.extraFields[selected_page] = extraFields self.parent.parent.main_frame.datasetMapfile[selected_page] = None self.parent.parent.directoryMap[selected_page] = None self.parent.parent.main_frame.dirp_firstfile[selected_page] = None self.parent.parent.defaultGlobalValues[selected_page] = {} except: pub_busy.busyEnd(self.parent.parent) # catch here in order to turn off the busy cursor ganz raise finally: pub_busy.busyEnd(self.parent.parent)
def return_content2(self, appendOpt=False): from esgcet.publish import iterateOverDatasets, processIterator from esgcet.config import getHandlerByName from esgcet.model import eventName from esgcet.config import loadConfig # Initialize parameters for interating over datasets initcontext = {} aggregateOnly = False # appendOpt = False initcontext = {} properties = {} publish = False publishOnly = False thredds = False testProgress1 = [self.parent.parent.statusbar.show, 0, 50] testProgress2 = [self.parent.parent.statusbar.show, 50, 100] handlerDictionary = {} # Get the currently selected tab and the selected datasets tab_name = self.parent.parent.top_notebook.getcurselection() selected_page = self.parent.parent.main_frame.selected_top_page datasetNames = [] # datasetNames2 = [] if (selected_page is None): warning("Must generate a list of datasets to scan before data extraction can occur.") return if (selected_page is not None) or (self.parent.parent.hold_offline[selected_page] == True): extraFields = None if (self.parent.parent.hold_offline[selected_page] == False) or (isinstance(self.parent.parent.hold_offline[selected_page], types.DictType)): for x in self.parent.parent.main_frame.top_page_id[selected_page]: dsetVersionName = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text') # GANZ TODO version_label # ganz added this 1/21/11 if (self.parent.parent.main_frame.version_label[selected_page] ): dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text') dsetVersion = self.parent.parent.main_frame.version_label[selected_page][x].cget('text') ##################################################################################### else: dset_name, dsetVersion = parseDatasetVersionId(dsetVersionName) # Retrieve all the datasets in the collection for display """ ganz test code status = pollDatasetPublicationStatus(dset_name, self.Session) status_text = pub_controls.return_status_text( status ) if status_text != 'Error': dsetTuple = parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')) datasetNames2.append(dsetTuple) """ # Retrieve only the datasets that have been selected if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon': dsetTuple = parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')) datasetNames.append(dsetTuple) dmap = self.parent.parent.main_frame.dmap[selected_page] extraFields = self.parent.parent.main_frame.extraFields[selected_page] datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page] projectName = self.parent.parent.main_frame.projectName[selected_page] directoryMap = self.parent.parent.directoryMap[selected_page] if dmap is not None: for x in datasetNames: dsetId = x[0] datasetName = x try: dmapentry = dmap[datasetName] except: # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version). # If so, replace the entry with the new key. 
trykey = (datasetName[0], -1) dmapentry = dmap[trykey] del dmap[trykey] dmap[datasetName] = dmapentry firstFile = dmapentry[0][0] self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session) handler = self.parent.parent.handlerDictionary[dsetId] # Copy the defaultGlobalValues into initcontext initcontext = self.parent.parent.main_frame.defaultGlobalValues[selected_page] else: # more test code myholdDirectoryMap = self.parent.parent.directoryMap[selected_page] #mydatasetNames = [(item,-1) for item in myholdDirectoryMap.keys()] mydatasetNames = [(item) for item in myholdDirectoryMap.keys()] #end for x in mydatasetNames: dsetId = x[0] datasetName = x # ganz this is test code try: dmapentry = myholdDirectoryMap[datasetName] except: # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version). # If so, replace the entry with the new key. trykey = (datasetName[0], -1) dmapentry = myholdDirectoryMap[trykey] del myholdDirectoryMap[trykey] myholdDirectoryMap[datasetName] = dmapentry firstFile = dmapentry[0][1] #end of test code #firstFile = self.parent.parent.main_frame.dirp_firstfile[selected_page] self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session) handler = self.parent.parent.handlerDictionary[dsetId] else: # working off-line projectName = self.parent.parent.main_frame.projectName[selected_page] if self.parent.parent.offline_file_directory[selected_page] == "directory": if self.parent.parent.config is None: extraction_controls.call_sessionmaker( self.parent.parent ) datasetPaths = [] dmap = {self.parent.parent.offline_datasetName : datasetPaths} listerSection = getOfflineLister(self.parent.parent.config, "project:%s"%projectName, None) offlineLister = self.parent.parent.config.get(listerSection, 'offline_lister_executable') lastargs = self.parent.parent.offline_directories commandArgs = "--config-section %s "%listerSection commandArgs += " ".join(lastargs) for filepath, size in processIterator(offlineLister, commandArgs, filefilt=self.parent.parent.filefilt): datasetPaths.append((filepath, str(size))) datasetNames = self.parent.parent.datasetNames directoryMap = None # get the handler for x in datasetNames: dsetId = x[0] self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True) elif self.parent.parent.offline_file_directory[selected_page] == "file": dmap = self.parent.parent.main_frame.dmap[selected_page] extraFields = self.parent.parent.main_frame.extraFields[selected_page] datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page] projectName = self.parent.parent.main_frame.projectName[selected_page] directoryMap = None if datasetMapfile is not None: dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True) datasetNames = dmap.keys() # get the handlers for x in datasetNames: dsetId = x[0] self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True) # Iterate over datasets if appendOpt: operation = UPDATE_OP else: operation = CREATE_OP datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, self.Session, self.parent.parent.aggregateDimension, operation, self.parent.parent.filefilt, initcontext, self.parent.parent.hold_offline[selected_page], properties, comment=self.comments, testProgress1=testProgress1, testProgress2=testProgress2 , handlerDictionary=self.parent.parent.handlerDictionary, extraFields=extraFields, 
readFiles=True) # If working on-line then replace the scanned list of datasets with # the complete list of datasets #test """ print 'datasetNames:' for t1 in datasetNames: print t1 print 'datasetNames2:' for t2 in datasetNames2: print t2 """ if not self.parent.parent.hold_offline[selected_page]: datasets = [] versionObjs = [] # ganz finally, tested datasetNames2 here for dsetName, version in datasetNames: result = Dataset.lookup(dsetName, self.Session, version=version) if result is not None: entry, versionObj = result datasets.append(entry) versionObjs.append(versionObj) # Get the summary of errors after doing a data extraction dset_error = [] for dset in datasets: status = dset.get_publication_status(self.Session) status_name = eventName[status] if dset.has_warnings(self.Session): dset_error.append(dset.get_name(self.Session)) try: list_fields = getQueryFields( handler ) except: handler = getHandlerByName(projectName, None, self.Session) list_fields = getQueryFields( handler ) # Display the datasets in the "Collection" page # if self.parent.parent.hold_offline[selected_page] == True: # tab_name = "Collection_Offline" # from_tab = "Collection" # pub_editorviewer = self.parent.parent.create_publisher_editor_viewer( self.parent.parent, tab_name, dataset, from_tab, self.Session) # Show the extracted datasets self.set_column_labels( len(datasets), list_fields ) self.show_extracted_info(datasets, dset_error, list_fields, versionObjs) # Enable the "Data Publication" button self.parent.ControlButton3.configure( state = 'normal' )
def getProduct(cmor_table, variable, experiment, year1, year2):
    """Get the DRS product value associated with the file.

    Returns 'output1' for datasets to be replicated, 'output2' for datasets outside
    the replicated datasets, 'output' if the product cannot be determined.
    """
    cmor_table = cmor_table.lower()
    if variable is not None:
        variable = variable.lower()

    # decadal1960, decadal1980, decadal2005 => decadal_30
    # Other decadal experiments => decadal_10
    if experiment is None:
        if WARN:
            warning("Found empty experiment field")
        base_year = None
    elif experiment[0:7] == 'decadal':
        fullexperiment = experiment
        if experiment in ['decadal1960', 'decadal1980', 'decadal2005']:
            experiment = 'decadal_30'
        else:
            experiment = 'decadal_10'
        try:
            base_year = int(fullexperiment[7:11])
        except:
            base_year = 0
    else:
        base_year = None

    # If the variable is not in the request list, => output2
    vardict = cmor_variables.get(cmor_table, None)
    reqdict = requested_time.get(cmor_table, None)

    # If the CMOR table or variable are unknown, don't even try
    if vardict is None or variable is None:
        result = 'output'

    # Check for variables outside the request list
    elif variable not in vardict:
        result = 'output2'

    # CMOR table == 'day'
    elif cmor_table == 'day':
        if variable in ['huss', 'omldamax', 'pr', 'psl', 'sfcwind', 'tas', 'tasmax', 'tasmin', 'tos', 'tossq']:
            result = 'output1'
        else:
            result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2)

    # CMOR table == 'Oyr'
    elif cmor_table == 'oyr':
        priority, dimensions = vardict[variable]
        if priority in [1, 2]:
            result = 'output1'
        else:
            result = 'output2'

    # CMOR table == 'Omon'
    elif cmor_table == 'omon':
        priority, dimensions = vardict[variable]
        if 'basin' in dimensions:
            result = 'output1'
        elif 'olevel' in dimensions and priority > 1:
            result = 'output2'
        else:
            result = 'output1'

    # CMOR table == 'aero'
    elif cmor_table == 'aero':
        priority, dimensions = vardict[variable]
        if 'alevel' not in dimensions:
            result = 'output1'
        else:
            result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2, base_year=base_year)

    # CMOR table == '6hrPlev', '3hr', 'cfMon', 'cfOff'
    elif cmor_table in ['6hrplev', '3hr', 'cfmon', 'cfoff']:
        result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2)

    # Otherwise => output1
    else:
        result = 'output1'

    return result
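# Hedged illustration of the decadal handling in getProduct above: experiment names
# are collapsed to 'decadal_30' or 'decadal_10' and the base year is read from the
# original name. The experiment name is hypothetical.
_fullexperiment = 'decadal1971'
if _fullexperiment in ['decadal1960', 'decadal1980', 'decadal2005']:
    _experiment = 'decadal_30'
else:
    _experiment = 'decadal_10'
_base_year = int(_fullexperiment[7:11])  # 1971
# _experiment == 'decadal_10'; non-decadal experiments leave base_year as None.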
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False): """ Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``). All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui]. Returns a list of persistent Dataset instances. projectName String name of the project associated with the datasets. If None, it is determined by the first handler found that can open a sample file from the dataset. dmap A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified. directoryMap A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``. datasetNames A list of dataset names identifying the datasets to be scanned. Session An SQLAlchemy Session. aggregateDimension Name of the dimension on which to aggregate the datasets. operation The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP filefilt String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored. initcontext Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles. Contrast with ``properties``. offlineArg Boolean flag or dictionary If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated metadata will be a minimal set including file name and size. If a dictionary, maps dataset_name => offline flag properties Dictionary of property/value pairs. The properties must be configured in the initialization file section corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``. testProgress1=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the scan phase. testProgress2=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the aggregation phase. handlerDictionary=None A dictionary mapping datasetName => handler. If None, handlers are determined by project name. keepVersion Boolean, True if the dataset version should not be incremented. newVersion Integer or dictionary. Set the new version number explicitly. If a dictionary, maps dataset_id => version. By default the version number is incremented by 1. See keepVersion. extraFields Extra dataset map fields, as from **readDatasetMap**. masterGateway The gateway that owns the master copy of the datasets. If None, the dataset is not replicated. Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated. comment=None String comment to associate with new datasets created. forceAggregate=False If True, run the aggregation step regardless. 
readFiles=False If True, interpret directoryMap as having one entry per file, instead of one per directory. """ from esgcet.publish import extractFromDataset, aggregateVariables versionIsMap = (type(newVersion) is types.DictType) if versionIsMap: saveVersionMap = newVersion prevProject = None datasets = [] ct = len(datasetNames) for iloop in range(ct): datasetName,versionno = datasetNames[iloop] # If using a version map, lookup the version for this dataset if versionIsMap: try: newVersion = saveVersionMap[datasetName] except KeyError: raise ESGPublishError("Dataset not found in version map: %s"%datasetName) context = initcontext.copy() # Get offline flag if type(offlineArg) is dict: offline = offlineArg[datasetName] else: offline = offlineArg # Don't try to aggregate offline datasets if offline: forceAggregate=False # Get a file iterator and sample file if dmap is not None: if len(dmap[(datasetName,versionno)])==0: warning("No files specified for dataset %s, version %d."%(datasetName,versionno)) continue firstFile = dmap[(datasetName,versionno)][0][0] fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg) else: direcTuples = directoryMap[datasetName] firstDirec, sampleFile = direcTuples[0] firstFile = os.path.join(firstDirec, sampleFile) if not readFiles: fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt) else: fileiter = fnIterator([sampfile for direc, sampfile in direcTuples]) # If the project is not specified, try to read it from the first file if handlerDictionary is not None and handlerDictionary.has_key(datasetName): handler = handlerDictionary[datasetName] elif projectName is not None: handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline) else: handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError("No project found in file %s, specify with --project."%firstFile) projectName = handler.name info("Using project name = %s"%projectName) if prevProject is not None and projectName!=prevProject: raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName)) prevProject = projectName # Generate the initial context from the dataset name context = handler.parseDatasetName(datasetName, context) # Load the rest of the context from the first file, if possible context = handler.getContext(**context) # Add properties from the command line fieldNames = handler.getFieldNames() for name, value in properties.items(): if name not in fieldNames: warning('Property not configured: %s, was ignored'%name) else: context[name] = value # Update the handler context and fill in default values handler.updateContext(context, True) # Ensure that fields are valid: try: handler.validateContext(context) except ESGInvalidMandatoryField: if offline: error("Dataset id has a missing or invalid mandatory field") raise # Create a CFHandler for validation of standard names, checking time axes, etc. 
cfHandler = handler.getMetadataHandler(sessionMaker=Session) dataset=None if testProgress1 is not None: testProgress1[1] = (100./ct)*iloop if not offline: testProgress1[2] = (100./ct)*iloop + (50./ct) else: testProgress1[2] = (100./ct)*iloop + (100./ct) dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, **context) # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset. runAggregate = (not offline) if hasattr(dataset, 'reaggregate'): runAggregate = (runAggregate and dataset.reaggregate) runAggregate = runAggregate or forceAggregate if testProgress2 is not None: testProgress2[1] = (100./ct)*iloop + 50./ct testProgress2[2] = (100./ct)*(iloop + 1) if runAggregate: aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset) elif testProgress2 is not None: # Just finish the progress GUI issueCallback(testProgress2, 1, 1, 0.0, 1.0) # Save the context with the dataset, so that it can be searched later handler.saveContext(datasetName, Session) datasets.append(dataset) return datasets
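# Hedged usage sketch mirroring the esgpublish flow: scan/aggregate a dataset map,
# then publish. All argument values are placeholders, and CREATE_OP and
# publishDatasetList are assumed to come from the esgcet.publish imports used
# elsewhere in this module.
def scan_and_publish_example(projectName, dmap, datasetNames, Session):
    datasets = iterateOverDatasets(projectName, dmap, None, datasetNames, Session,
                                   "time", CREATE_OP, r'.*\.nc$', {}, False, {})
    return publishDatasetList(datasetNames, Session, publish=True, thredds=True)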
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of )dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server. las Boolean flag: if true (the default), reinitialize server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. restInterface Boolean flag. If True, publish datasets with the RESTful publication services. """ if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation) deleteOnGateway = (gatewayOperation==DELETE) operation = (gatewayOperation!=NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s"%datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: serviceURL = getRestServiceURL() serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = nameDict[datasetName] if (not DELETE_AT_DATASET_LEVEL) and (dset is not None): for versionObj in versionObjs: try: eventName, stateName = deleteGatewayDatasetVersion(versionObj.name, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n'))) continue info(" Result: %s"%stateName) resultDict[datasetName] = eventName else: # Nothing in the node database, but still try to delete on the gateway if DELETE_AT_DATASET_LEVEL and (dset is not None) and (not restInterface): datasetName = dset.name try: eventName, stateName = deleteGatewayDatasetVersion(datasetName, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetName, string.join(fields[-2:], '\n'))) continue
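# Hedged sketch of the gatewayOperation handling above.  The enumeration values
# here are local placeholders (prefixed with '_'), not the publish module's
# constants; only the flag logic mirrors the code.
_DELETE, _UNPUBLISH, _NO_OPERATION = 1, 2, 3

def _gateway_flags(gateway_operation):
    if gateway_operation not in (_DELETE, _UNPUBLISH, _NO_OPERATION):
        raise ValueError("Invalid gateway operation: %d" % gateway_operation)
    delete_on_gateway = (gateway_operation == _DELETE)    # remove all gateway metadata
    call_gateway = (gateway_operation != _NO_OPERATION)   # contact the gateway at all
    return delete_on_gateway, call_gateway

# _gateway_flags(_UNPUBLISH) -> (False, True): retract the dataset from discovery
# without deleting its gateway metadata.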
def esgpublishWrapper(**kw): from esgcet.query import queryDatasetMap aggregateDimension = kw.get("aggregateDimension", "time") datasetMapfile = kw.get("datasetMapfile", None) datasetName = kw.get("datasetName", None) directoryList = kw.get("directoryList", None) echoSql = kw.get("echoSql", False) filefilt = kw.get("filefilt", '.*\.nc$') init_file = kw.get("init_file", None) initcontext = kw.get("initcontext", {}) keepVersion = kw.get("keepVersion", False) las = kw.get("las", False) log_filename = kw.get("log_filename", None) masterGateway = kw.get("masterGateway", None) message = kw.get("message", None) offline = kw.get("offline", False) parent = kw.get("parent", None) perVariable = kw.get("perVariable", None) projectName = kw.get("projectName", None) properties = kw.get("properties", {}) publish = kw.get("publish", False) publishOnly = kw.get("publishOnly", False) publishOp = kw.get("publishOp", CREATE_OP) readFiles = kw.get("readFiles", False) readFromCatalog = kw.get("readFromCatalog", False) reinitThredds = kw.get("reinitThredds", None) rescan = kw.get("rescan", False) rescanDatasetName = kw.get("rescanDatasetName", []) resultThreddsDictionary = None service = kw.get("service", None) summarizeErrors = kw.get("summarizeErrors", False) testProgress1 = kw.get("testProgress1", None) testProgress2 = kw.get("testProgress2", None) thredds = kw.get("thredds", False) threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None) version = kw.get("version", None) # If offline, the project must be specified if offline and (projectName is None): raise ESGPublishError( "Must specify project with --project for offline datasets") # Must specify version for replications if masterGateway is not None and version is None: raise ESGPublishError( "Must specify version with --new-version for replicated datasets") # Load the configuration and set up a database connection config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename) # Register project handlers registerHandlers() # If the dataset map is input, just read it ... dmap = None directoryMap = None extraFields = None if datasetMapfile is not None: dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True) datasetNames = dmap.keys() elif rescan: # Note: No need to get the extra fields, such as mod_time, since # they are already in the database, and will be used for file comparison if necessary. dmap, offline = queryDatasetMap(rescanDatasetName, Session) datasetNames = dmap.keys() # ... otherwise generate the directory map. else: # Online dataset(s) if not offline: if projectName is not None: handler = getHandlerByName(projectName, None, Session) else: multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt) firstFile, size = multiIter.next() listIter = list(multiIter) handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError( "No project found in file %s, specify with --project." % firstFile) projectName = handler.name props = properties.copy() props.update(initcontext) if not readFiles: directoryMap = handler.generateDirectoryMap( directoryList, filefilt, initContext=props, datasetName=datasetName) else: directoryMap = handler.generateDirectoryMapFromFiles( directoryList, filefilt, initContext=props, datasetName=datasetName) datasetNames = [(item, -1) for item in directoryMap.keys()] # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...] 
else: handler = getHandlerByName(projectName, None, Session, offline=True) dmap = {} listerSection = getOfflineLister(config, "project:%s" % projectName, service) offlineLister = config.get(listerSection, 'offline_lister_executable') commandArgs = "--config-section %s " % listerSection commandArgs += " ".join(directoryList) for dsetName, filepath, sizet in processNodeMatchIterator( offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True): size, mtime = sizet if dmap.has_key((dsetName, -1)): dmap[(dsetName, -1)].append((filepath, str(size))) else: dmap[(dsetName, -1)] = [(filepath, str(size))] datasetNames = dmap.keys() datasetNames.sort() if len(datasetNames) == 0: warning("No datasets found.") # Iterate over datasets if not publishOnly: datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles) result = publishDatasetList( datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, threddsCatalogDictionary=threddsCatalogDictionary, reinitThredds=reinitThredds, readFromCatalog=readFromCatalog) return result
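# Minimal sketch of the offline dataset-map structure built above:
# dmap[(dataset_name, -1)] = [(path, size_as_string), ...].  The iterable below
# stands in for processNodeMatchIterator, which is not reproduced here, and the
# example paths and sizes are invented.
def _build_offline_dmap(entries):
    """entries: iterable of (dataset_name, filepath, (size, mtime)) tuples."""
    dmap = {}
    for dset_name, filepath, (size, _mtime) in entries:
        dmap.setdefault((dset_name, -1), []).append((filepath, str(size)))
    return dmap

# _build_offline_dmap([("obs.dset1", "/data/a.nc", (1024, 0.0))])
#   -> {("obs.dset1", -1): [("/data/a.nc", "1024")]}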
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None): """ Aggregate file variables into variables, and add to the database. Populates the database tables: - variable - file_variable - associated attribute tables Returns a Dataset object. datasetName String dataset identifier. dbSession A database Session. aggregateDimensionName The name of the dimension across which the dataset is aggregated, if any. cfHandler A CFHandler to validate standard names, etc. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. stopEvent Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped. datasetInstance Existing dataset instance. If not provided, the instance is regenerated from the database. """ session = dbSession() info("Aggregating variables") # Lookup the dataset if datasetInstance is None: dset = session.query(Dataset).filter_by(name=datasetName).first() for variable in dset.variables: session.delete(variable) for attrname, attr in dset.attributes.items(): if not attr.is_category: del dset.attributes[attrname] session.commit() dset.variables = [] else: dset = datasetInstance # session.save_or_update(dset) session.add(dset) if dset is None: raise ESGPublishError("Dataset not found: %s"%datasetName) dsetindex = {} # dsetindex[varname] = [(variable, domain), (variable, domain), ...] # where domain = ((dim0, len0, 0), (dim1, len1, 1), ...) # Note: # (1) If a dim0 is the aggregate dimension, len0 is 0 # (2) A dsetindex entry will only have multiple tuples if # there are more than one variable with the same name # and different domains. 
varindex = {} # varindex[(varname, domain, attrname)] = attribute globalAttrIndex = {} # globalAttrIndex[attname] = attval, for global attributes dsetvars = [] # Create variables seq = 0 nfiles = len(dset.getFiles()) for file in dset.getFiles(): for filevar in file.file_variables: # Get the filevar and variable domain fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions) fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ])) filevar.domain = fvdomain if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName: vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length else: vardomain = tuple(fvdomain) # Create the variable if necessary varlist = dsetindex.get(filevar.short_name, None) if varlist is None or vardomain not in [item[1] for item in varlist]: var = Variable(filevar.short_name, filevar.long_name) var.domain = vardomain # Record coordinate variable range if applicable if filevar.coord_type is not None: var.coord_type = filevar.coord_type if var.coord_type=='Z': var.coord_values = filevar.coord_values var.coord_range = filevar.coord_range dsetvars.append(var) if varlist is None: dsetindex[var.short_name] = [(var, vardomain)] else: varlist.append((var, vardomain)) else: for tvar, domain in varlist: if domain==vardomain: var = tvar break # Attach the file variable to the variable var.file_variables.append(filevar) # Create attributes for fvattribute in filevar.attributes: vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None) if vattribute is None: attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length) var.attributes.append(attribute) varindex[(var.short_name, vardomain, attribute.name)] = attribute if attribute.name == 'units': var.units = attribute.value # Create global attributes for fileattr in file.attributes: fattribute = globalAttrIndex.get(fileattr.name, None) if fattribute is None and fileattr.name not in ['readDimension']: attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length) dset.attributes[attribute.name] = attribute globalAttrIndex[attribute.name] = attribute seq += 1 try: issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent) except: session.rollback() session.close() raise # Find the aggregation dimension bounds variable, if any aggDim = lookupVar(aggregateDimensionName, dsetindex) boundsName = lookupAttr(aggDim, 'bounds') aggUnits = lookupAttr(aggDim, 'units') aggDimBounds = lookupVar(boundsName, dsetindex) # Set calendar for time aggregation isTime = cfHandler.axisIsTime(aggDim) if isTime: calendar = cfHandler.getCalendarTag(aggDim) if calendar is None: calendar = "gregorian" else: calendar = None dset.calendar = calendar dset.aggdim_name = aggregateDimensionName dset.aggdim_units = aggUnits cdcalendar = cfHandler.tagToCalendar(calendar) # Add the non-aggregate dimension variables to the dataset for var in dsetvars: if var not in [aggDim, aggDimBounds]: dset.variables.append(var) # Set coordinate ranges for var in dset.variables: for name, length, seq in var.domain: if name==aggregateDimensionName: continue dvar = lookupCoord(name, dsetindex, length) if dvar is not None: units = lookupAttr(dvar, 'units') if units is None: warning("Missing units, variable=%s"%dvar.short_name) units = '' if hasattr(dvar, 'coord_type'): if dvar.coord_type=='X': var.eastwest_range = dvar.coord_range+':'+units elif dvar.coord_type=='Y': var.northsouth_range = 
dvar.coord_range+':'+units elif dvar.coord_type=='Z': var.updown_range = dvar.coord_range+':'+units var.updown_values = dvar.coord_values # Attach aggregate dimension filevars to files if aggDim is not None: for filevar in aggDim.file_variables: filevar.file.aggDim = filevar if aggDimBounds is not None: for filevar in aggDimBounds.file_variables: filevar.file.aggDimBounds = filevar # Combine aggregate dimensions: # Scan all variables with the aggregate dimension in the domain. For each such variable, # create an aggregate dimension variable, and bounds if needed. timevars = [] for var in dset.variables: if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]: aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName) aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName) if aggVar is not None: aggVar.units = aggUnits timevars.append(aggVar) if aggBoundsVar is not None: timevars.append(aggBoundsVar) # Create variable dimensions, aggregating the agg dimension debug("Creating dimensions") i = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain # Increment aggregate dimension length if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]: for filevar in var.file_variables: fvdomain = filevar.domain vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:]) var.domain = vardomain # Create the variable domain for name, length, seq in vardomain: dimension = VariableDimension(name, length, seq) var.dimensions.append(dimension) i += 1 try: issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent) except: session.rollback() session.close() raise # Set variable aggregate dimension ranges debug("Setting aggregate dimension ranges") seq = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName: # Adjust times so they have consistent base units try: filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables] except: for fv in var.file_variables: try: firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar) lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar) except: error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units)) raise mono = cmp(filevarRanges[0][1], filevarRanges[0][2]) if mono<=0: filevarRanges.sort(lambda x, y: cmp(x[1], y[1])) else: filevarRanges.sort(lambda x, y: -cmp(x[1], y[1])) # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated. 
lastValues = numpy.array(map(lambda x: x[2], filevarRanges)) firstValues = numpy.array(map(lambda x: x[1], filevarRanges)) if (var not in [aggDim, aggDimBounds]): if mono<=0: compare = (lastValues[0:-1] >= firstValues[1:]) else: compare = (lastValues[0:-1] <= firstValues[1:]) if compare.any(): overlaps = compare.nonzero()[0] dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True nprint = min(len(overlaps), 3) for i in range(nprint): dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE) dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE) if len(overlaps)>nprint: dset.warning(" ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE) # Check monotonicity of last values. else: if mono<=0: compare = (lastValues[0:-1] < lastValues[1:]).all() else: compare = (lastValues[0:-1] > lastValues[1:]).all() if not compare: dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True var.aggdim_first = float(firstValues[0]) var.aggdim_last = float(lastValues[-1]) seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent) except: session.rollback() session.close() raise # Combine identical aggregate dimensions and add to the dataset timevardict = {} for var in timevars: timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var for var in timevardict.values(): dset.variables.append(var) # Validate standard names seq = 0 nvars = len(dset.variables) for var in dset.variables: attr = lookupAttr(var, 'standard_name') if (attr is not None): if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)): info("Invalid standard name: %s for variable %s"%(attr, var.short_name)) else: var.standard_name = attr seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent) except: session.rollback() session.close() raise debug("Adding variable info to database") session.commit() session.close()
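# Illustrative numpy sketch of the overlap test above: after sorting the per-file
# (first, last) ranges along the aggregate dimension, an ascending series overlaps
# whenever a file's last value reaches into the next file's first value.  The
# helper and its sample values are invented for illustration.
import numpy

def _find_overlaps(ranges, ascending=True):
    """ranges: list of (first, last) pairs, already sorted by first value.
    Returns the indices i where ranges[i] overlaps ranges[i+1]."""
    firsts = numpy.array([r[0] for r in ranges])
    lasts = numpy.array([r[1] for r in ranges])
    if ascending:
        compare = lasts[:-1] >= firsts[1:]
    else:
        compare = lasts[:-1] <= firsts[1:]
    return compare.nonzero()[0]

# _find_overlaps([(0, 30), (31, 60), (60, 90)]) -> array([1]): the second and
# third files both contain the aggregate-dimension value 60.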
def extractFromFile(dataset, openfile, fileobj, session, handler, cfHandler, aggdimName=None, varlocate=None, exclude_variables=None, perVariable=None, **context): """ Extract metadata from a file, add to a database. dataset The dataset instance. openfile An open netCDF file object. fileobj A (logical) file instance. session A database session instance. cfHandler A CF handler instance handler Project handler aggdimName The name of the dimension which is split across files, if any. varlocate List with elements [varname, pattern]. The variable will be extracted from the file only if the filename matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']] exclude_variables List of thredds_exclude_variables perVariable Boolean, Try to find a target_variable if true and extract all variables if false context A dictionary with keys project, model, experiment, and run. """ fileVersion = fileobj.versions[-1] # Get the aggregate dimension range if aggdimName is not None and openfile.hasVariable(aggdimName): aggvarFirst = openfile.getVariable(aggdimName, index=0) aggvarLast = openfile.getVariable(aggdimName, index=-1) aggvarLen = openfile.inquireVariableShape(aggdimName)[0] aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName)) if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"): if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12: dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE) if aggdimName is not None and not openfile.hasVariable(aggdimName): info("Aggregate dimension not found: %s"%aggdimName) varlocatedict = {} if varlocate is not None: for varname, pattern in varlocate: varlocatedict[varname.strip()] = pattern.strip() # Create global attribute target_variable = None for attname in openfile.inquireAttributeList(): attvalue = openfile.getAttribute(attname, None) atttype, attlen = getTypeAndLen(attvalue) attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen) fileobj.attributes.append(attribute) if attname == 'tracking_id': fileVersion.tracking_id = attvalue # extract target_variable from global attributes if attname == 'variable_id' and perVariable: target_variable = attvalue debug('Extracted target variable from global attributes: %s' % target_variable) debug('.%s = %s' % (attname, attvalue)) # try to get target_variable from DRS if not found in global attributes if not target_variable and perVariable: config = getConfig() if config is not None: drs_pattern = handler.getFilters()[0][1:-1] drs_file_pattern = '%s/(?P<filename>[\w.-]+)$' % drs_pattern drs_parts = re.search(drs_file_pattern, openfile.path).groupdict() if 'variable' in drs_parts: target_variable = drs_parts['variable'] debug('Extracted target variable from DRS: %s' % target_variable) # target_variable must be present in the file if target_variable not in openfile.inquireVariableList(): target_variable = None # For each variable in the file: for varname in openfile.inquireVariableList(): # we need to extract only target, aggregation and coverage variables if target_variable: is_coverage_variable = check_coverage_variable(varname, openfile) if not is_coverage_variable and varname != target_variable and varname != aggdimName: debug("Skipping variable %s in %s (not target (%s), coverage or aggregation (%s) variable)" % (varname, fileVersion.location, target_variable, aggdimName)) continue varshape = 
openfile.inquireVariableShape(varname) debug("%s%s"%(varname, `varshape`)) # Check varlocate if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname].strip(), os.path.basename(fileVersion.location)): debug("Skipping variable %s in %s"%(varname, fileVersion.location)) continue is_target_variable = True if target_variable and target_variable != varname: is_target_variable = False elif varname in exclude_variables: is_target_variable = False # Create a file variable varstr = openfile.getAttribute('long_name', varname, None) if not varstr is None and len(varstr) > 255: varstr = varstr[0:255] filevar = FileVariable(varname, varstr, is_target_variable=is_target_variable) fileobj.file_variables.append(filevar) # Create attributes: for attname in openfile.inquireAttributeList(varname): attvalue = openfile.getAttribute(attname, varname) atttype, attlen = getTypeAndLen(attvalue) attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen) filevar.attributes.append(attribute) debug(' %s.%s = %s'%(varname, attname, `attvalue`)) # Create dimensions seq = 0 dimensionList = openfile.inquireVariableDimensions(varname) for dimname, dimlen in zip(dimensionList, varshape): dimension = FileVariableDimension(dimname, dimlen, seq) filevar.dimensions.append(dimension) if dimname==aggdimName: filevar.aggdim_first = float(aggvarFirst) filevar.aggdim_last = float(aggvarLast) filevar.aggdim_units = aggvarunits seq += 1 # Set coordinate axis range and type if applicable if len(varshape)==1: var0 = openfile.getVariable(varname, index=0) if var0 is None: continue varn = openfile.getVariable(varname, index=-1) if cfHandler.axisIsLatitude(filevar): filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'Y' elif cfHandler.axisIsLongitude(filevar): filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'X' elif cfHandler.axisIsLevel(filevar): vararray = openfile.getVariable(varname) filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'Z' filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above
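# Small sketch of the varlocate filter used above: a variable is only extracted
# from a file whose basename matches the configured pattern at the start.  The
# variable names and patterns below are examples, not publisher configuration.
import os
import re

def _keep_variable(varname, file_location, varlocate):
    """varlocate: list of [varname, pattern] pairs, as in the docstring above."""
    patterns = dict((name.strip(), pat.strip()) for name, pat in varlocate)
    pattern = patterns.get(varname)
    if pattern is None:
        return True  # no restriction configured for this variable
    return re.match(pattern, os.path.basename(file_location)) is not None

# _keep_variable('ps', '/archive/ps_Amon_model.nc', [['ps', 'ps_']])  -> True
# _keep_variable('ps', '/archive/tas_Amon_model.nc', [['ps', 'ps_']]) -> False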
def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False): """Generate a directory map. Recursively scan each directory in *directoryList*, locating each directory with at least one file matching filefilt. Returns a directory map (dictionary) mapping dataset_id => [(directory_path, filepath), (directory_path, filepath), ...] where the dataset_id is generated by matching the 'directory_format' configuration option to each directory path. The map has one entry per directory, where it is assumed that all files in the directory belong to the same dataset. directoryList List of directories to scan. The scan searches for directories matching the 'directory_format' configuration file option for this project, and having at least one file matching *filefilt*. filefilt Regular expression as defined by the Python **re** module. Matched against the file basename. initContext Dictionary of field => value items. Entries override values found from matching the directory paths. datasetName Name of the dataset. If not specified, generate with ``generateDatasetId()``. """ from esgcet.publish import nodeIterator # If the dataset name is specified, no need to get directory format filters if datasetName is None: # Get the dataset_id and filters filters = self.getFilters() config = getConfig() section = 'project:'+self.name dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True)) idfields = [re.findall(_patpat, format) for format in dataset_id_formats] else: filters = [r'.*$'] # Iterate over nodes mapdict = self.getMaps() datasetMap = {} for direc in directoryList: if direc[-1]=='/': direc = direc[:-1] nodeiter = nodeIterator(direc, filters, filefilt) for nodepath, filepath, groupdict in nodeiter: if initContext is not None: groupdict.update(initContext) if not groupdict.has_key('project'): groupdict['project'] = self.name if datasetName is None: try: datasetId = self.generateDatasetId('dataset_id', idfields, groupdict, multiformat=dataset_id_formats) if use_version and 'version' in groupdict: drsversion = groupdict['version'] if not re.match('^[0-9]+$', drsversion[0]): # e.g. vYYYYMMDD drsversion = drsversion[1:] datasetId += '#%s'%drsversion except: allfields = reduce(lambda x,y: set(x)+set(y), idfields) missingFields = list((set(allfields)-set(groupdict.keys()))-set(config.options(section))) raise ESGPublishError("Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"%(`missingFields`, nodepath)) else: warning("Empty dataset name. Check that directory hierarchy format matches the configured format string in esg.ini") datasetId = datasetName if datasetMap.has_key(datasetId): datasetMap[datasetId].append((nodepath, filepath)) else: datasetMap[datasetId] = [(nodepath, filepath)] if (len(datasetMap) == 0 ): warning("Empty datasetMap. Check that directory hierarchy format matches the configured format string in esg.ini") return datasetMap
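# Hedged sketch of the directory-map shape produced by generateDirectoryMap():
# a dataset_id (optionally suffixed with '#version') maps to a list of
# (directory, filename) pairs, one entry per scanned directory.  The ids, paths
# and helper name below are invented.
def _append_to_dataset_map(dataset_map, dataset_id, nodepath, filepath, version=None):
    if version is not None:
        # Drop a leading 'v' from vYYYYMMDD-style directory names, as above.
        if not version.isdigit():
            version = version[1:]
        dataset_id = '%s#%s' % (dataset_id, version)
    dataset_map.setdefault(dataset_id, []).append((nodepath, filepath))
    return dataset_map

# _append_to_dataset_map({}, 'cmip5.output1.MODEL.expt', '/data/v20120101', 'tas.nc',
#                        version='v20120101')
#   -> {'cmip5.output1.MODEL.expt#20120101': [('/data/v20120101', 'tas.nc')]}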
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, restInterface=False): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of )dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server. las Boolean flag: if true (the default), reinitialize server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. restInterface Boolean flag. If True, publish datasets with the RESTful publication services. """ if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d" % gatewayOperation) deleteOnGateway = (gatewayOperation == DELETE) operation = (gatewayOperation != NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName, version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName( datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s" % datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL() servicePort = config.getint('DEFAULT', 'hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: serviceURL = getRestServiceURL() serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName, version in datasetNames: isDataset, dset, versionObjs, isLatest = nameDict[datasetName] if (not DELETE_AT_DATASET_LEVEL) and (dset is not None): for versionObj in versionObjs: try: eventName, stateName = deleteGatewayDatasetVersion( versionObj.name, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[-2:], '\n'))) continue info(" Result: %s" % stateName) resultDict[datasetName] = eventName else: # Nothing in the node database, but still try to delete on the gateway if DELETE_AT_DATASET_LEVEL and (dset is not None) and ( not restInterface): datasetName = dset.name try: eventName, stateName = deleteGatewayDatasetVersion( datasetName, gatewayOperation, service, session, dset=dset) except RemoteCallException, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = ` e `.split('\n') error( "Deletion/retraction failed for dataset/version %s with message: %s" % (datasetName, string.join(fields[-2:], '\n'))) continue
def checkAndUpdateRepo(cmor_table_path, ds_version): """ Checks for a file written to a predefined location. if not present or too old, will pull the repo based on the input path argument and update the timestamp. """ # This is run during handler initialization and not for each file validation # Pull repo if fetched more than one day ago # or if never fetched before if os.path.exists(UPDATE_TIMESTAMP): mtime = os.path.getmtime(UPDATE_TIMESTAMP) now = time() if now - mtime > (86400.0): pull_cmor_repo = True else: pull_cmor_repo = False else: pull_cmor_repo = True if pull_cmor_repo: try: # Go into CMOR table path # Git fetch CMOR table repo # Go back to previous working directory checkedRun(('cd {} && git fetch --quiet' ).format(cmor_table_path)) # Update local timestamp f = open(UPDATE_TIMESTAMP, "w") f.write("CMOR table updated at {}".format(time())) f.close() debug("Local CMOR table repository fetched or updated") except Exception as e : warning("Attempt to update the cmor table repo and encountered an error: " + str(e)) # Change repo branch in any case try: # Go into CMOR table path # Stash any changes from previous checkout # Checkout to the appropriate CMOR table tag # Go back to previous working directory checkedRun(('cd {} && git stash --quiet && git checkout {} --quiet' ).format(cmor_table_path, ds_version)) # Update local timestamp f = open(UPDATE_TIMESTAMP, "w") f.write("CMOR table updated at {}".format(time())) f.close() debug("Consider CMOR table tag: {}".format(ds_version)) except Exception as e: raise ESGPublishError("Error data_specs_version tag %s not found in the CMOR tables or other error. Please contact support"%ds_version) # Get most up to date CMIP6_CV in any case if ds_version != "master": try: # Go into CMOR table path # PrePARE requires to have the most up to date CMIP6 CV. # Update CMIP6_CV.json from master branch. # Go back to previous working directory checkedRun(('cd {} && git checkout master CMIP6_CV.json --quiet' ).format(cmor_table_path)) debug("CMIP6 CV updated from master") except Exception as e: raise ESGPublishError("Master branch does not exists or CMIP6_CV.json not found or other error. Please contact support" % ds_version)
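# Minimal sketch of the one-day staleness test used above, with the 86400-second
# threshold made explicit.  The timestamp path and helper name are placeholders.
import os
from time import time

ONE_DAY_SECONDS = 86400.0

def _needs_refresh(timestamp_path, max_age=ONE_DAY_SECONDS):
    """Return True if the marker file is missing or older than max_age seconds."""
    if not os.path.exists(timestamp_path):
        return True
    return (time() - os.path.getmtime(timestamp_path)) > max_age

# _needs_refresh('/tmp/cmor_table_timestamp') -> True on first run, then False
# until a day has passed or the marker file is removed.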
def getProduct(cmor_table, variable, experiment, year1, year2): """Get the DRS product value associated with the file. Returns 'output1' for datasets to be replicated, 'output2' for datasets outside the replicated datasets, 'output' if the product cannot be determined. """ cmor_table = cmor_table.lower() variable = variable.lower() # decadal1960, decadal1980, decadal2005 => decadal_30 # Other decadal experiments => decadal_10 if experiment is None and WARN: warning("Found empty experiment field") base_year = None else: if experiment[0:7] == 'decadal': fullexperiment = experiment if experiment in ['decadal1960', 'decadal1980', 'decadal2005']: experiment = 'decadal_30' else: experiment = 'decadal_10' try: base_year = int(fullexperiment[7:11]) except: base_year = 0 else: base_year = None # If the variable is not in the request list, => output2 vardict = cmor_variables.get(cmor_table, None) reqdict = requested_time.get(cmor_table, None) # If the CMOR table or variable are unknown, don't even try if vardict is None or variable is None: result = 'output' # Check for variables outside the request list elif variable not in vardict: result = 'output2' # CMOR table == 'day' elif cmor_table == 'day': if variable in [ 'huss', 'omldamax', 'pr', 'psl', 'sfcwind', 'tas', 'tasmax', 'tasmin', 'tos', 'tossq' ]: result = 'output1' else: result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2) # CMOR table == 'Oyr' elif cmor_table == 'oyr': priority, dimensions = vardict[variable] if priority in [1, 2]: result = 'output1' else: result = 'output2' # CMOR table == 'Omon' elif cmor_table == 'omon': priority, dimensions = vardict[variable] if 'basin' in dimensions: result = 'output1' elif 'olevel' in dimensions and priority > 1: result = 'output2' else: result = 'output1' # CMOR table == 'aero' elif cmor_table == 'aero': priority, dimensions = vardict[variable] if 'alevel' not in dimensions: result = 'output1' else: result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2, base_year=base_year) # CMOR table == '6hrPlev', '3hr', 'cfMon', 'cfOff' elif cmor_table in ['6hrplev', '3hr', 'cfmon', 'cfoff']: result = getTimeDependentProduct(cmor_table, variable, experiment, reqdict, year1, year2) # Otherwise => output1 else: result = 'output1' return result
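# Sketch of the decadal-experiment normalisation performed at the top of
# getProduct(): three named experiments collapse to 'decadal_30', every other
# 'decadalYYYY' experiment to 'decadal_10', and the base year is read from the
# experiment name when possible.  The helper name is illustrative.
def _normalize_decadal(experiment):
    if experiment is None or not experiment.startswith('decadal'):
        return experiment, None
    if experiment in ('decadal1960', 'decadal1980', 'decadal2005'):
        bucket = 'decadal_30'
    else:
        bucket = 'decadal_10'
    try:
        base_year = int(experiment[7:11])
    except ValueError:
        base_year = 0
    return bucket, base_year

# _normalize_decadal('decadal1980') -> ('decadal_30', 1980)
# _normalize_decadal('decadal1995') -> ('decadal_10', 1995)
# _normalize_decadal('historical')  -> ('historical', None)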
def parseDatasetName(self, datasetName, context): """Parse a dataset name. Returns a dictionary, mapping field => value. The config file option 'dataset_id' is used to parse the name into fields. datasetName String dataset identifier. context Initial context dictionary. This argument is altered on output. """ config = getConfig() section = 'project:' + self.name datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None) if datasetIdFormatList is None: # warning("No dataset_id option found for project %s"%self.name) return context datasetIdFormats = splitLine(datasetIdFormatList) formatMatched = False for idFormat in datasetIdFormats: # '.' => '\.' newinit = re.sub(r'\.', r'\.', idFormat.strip()) # %(name)s => (?P<name>[^.]*) newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit) # If experiment is enumerated, match on the experiment options. This allows # experiment ids to contain periods (.) . experimentOptions = self.getFieldOptions('experiment') # Map to case-sensitive options experimentOptions = self.mapValidFieldOptions( 'experiment', experimentOptions) if idFormat.find( '%(experiment)s') != -1 and experimentOptions is not None: if len(experimentOptions) > 0: optionOr = reduce(lambda x, y: x + '|' + y, experimentOptions) experimentPattern = r'(?P<experiment>%s)' % optionOr newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern) if newinit[-1] != '$': newinit += '$' match = re.match(newinit, datasetName) if match is None: continue else: result = match.groupdict() formatMatched = True for key, value in result.items(): if context.has_key(key) and value != context[key]: warning("Dataset ID=%s, but %s=%s" % (datasetName, key, context[key])) else: context[str(key)] = value break if not formatMatched: warning( "Dataset ID: %s does not match the dataset_id format(s): %s" % (datasetName, ` datasetIdFormats `)) return context
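# Illustrative conversion of a dataset_id format string into a regular expression
# with named groups, mirroring the substitutions in parseDatasetName().  The
# format string in the usage note is an example, not a value read from esg.ini,
# and _FIELD_PATTERN stands in for the module's _patpat.
import re

_FIELD_PATTERN = re.compile(r'%\((\w+)\)s')

def _format_to_regex(id_format):
    pattern = re.sub(r'\.', r'\.', id_format.strip())          # '.' -> '\.'
    pattern = _FIELD_PATTERN.sub(r'(?P<\1>[^.]*)', pattern)    # %(name)s -> named group
    if not pattern.endswith('$'):
        pattern += '$'
    return pattern

# match = re.match(_format_to_regex('%(project)s.%(model)s.%(experiment)s'),
#                  'cmip5.MODELX.historical')
# match.groupdict() -> {'project': 'cmip5', 'model': 'MODELX', 'experiment': 'historical'}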
def evt_remove_dataset(self, parent): from esgcet.publish import pollDatasetPublicationStatus # Start the busy routine to indicate to the users something is happening parent.busyCursor = 'watch' parent.busyWidgets = [parent.pane2.pane( 'EditPaneTop' ), parent.pane2.pane( 'EditPaneBottom' ), parent.pane2.pane( 'EditPaneStatus' ), parent.pane.pane( 'ControlPane' )] pub_busy.busyStart( parent ) datasetNames = [] GUI_line = {} DELETE = 1 #UNPUBLISH = 2 NO_OPERATION = 3 DeleteLocalDB = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox1() # DeleteLocalDB DeleteGateway = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox2() # DeleteGateway DeleteThredds = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox3() # DeleteThredds selected_page = parent.main_frame.selected_top_page if selected_page is not None: tab_name = parent.top_notebook.getcurselection() for x in parent.main_frame.top_page_id[selected_page]: if parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon' and parent.main_frame.top_page_id2[selected_page][x].cget('bg') != 'salmon': dset_name = parent.main_frame.top_page_id2[selected_page][x].cget('text') #dsetVersionName1 = self.parent.parent.main_frame.top_page_id2v[selected_page][x].cget('text') #query_name, dset_version = parseDatasetVersionId(dsetVersionName1) """ ganz I am modifying this so that if a user selects a dataset without a version then we delete all versions of that dataset""" try: dset_version = parent.main_frame.version_label[selected_page][x].cget('text') except: dset_version = -1 #print 'Delete all versions' #dset_version = 1 if (dset_version == 'N/A' or not dset_version): dset_version = -1 # continue # not published, yet # Only delete published events status = pollDatasetPublicationStatus(dset_name, self.Session) if status == 3 or DeleteGateway or DeleteThredds or DeleteLocalDB: #datasetNames.append(generateDatasetVersionId((dset_name, dset_version))) datasetNames.append([dset_name, dset_version]) # ganz create name/version to delete else: parent.main_frame.top_page_id[selected_page][x].configure(relief = 'raised', background = 'salmon', image = self.off) GUI_line[ dset_name ] = x else: if parent.main_frame.top_page_id2[selected_page][x].cget('bg') == 'salmon': parent.main_frame.top_page_id[selected_page][x].configure(relief = 'raised', background = 'salmon', image = self.off) else: warning("%d: No pages generated for selection. Remove is only used to remove datasets from the Publisher." % logging.WARNING) # Remove dataset from the gateway, etc. if ((DeleteGateway==0 or DeleteThredds==0) and DeleteLocalDB==1) : ans = self.warn_On_Removal() if (ans == FALSE): return if DeleteGateway==1: gatewayOp = DELETE else: gatewayOp = NO_OPERATION # now decide if there is anything to do if (gatewayOp==1 or DeleteThredds==1 or DeleteLocalDB==1) : las=False thredds = (DeleteThredds==1) deleteDset = (DeleteLocalDB==1) testProgress = (parent.parent.statusbar.show, 0, 100) status_dict = deleteDatasetList(datasetNames, self.Session, gatewayOp, thredds, las, deleteDset, progressCallback=testProgress) # Show the published status try: for x in status_dict.keys(): status = status_dict[ x ] parent.main_frame.status_label[selected_page][GUI_line[x]].configure(text=pub_controls.return_status_text( status) ) except: pass pub_busy.busyEnd( parent ) # ganz refresh [if there were no exceptions] dataset list after deletions parent.pub_buttonexpansion.query_widgets.parent.parent.ntk.evt_refresh_list_of_datasets(selected_page )
def checkAndUpdateRepo(cmor_table_path, ds_version): """ Checks for a file written to a predefined location. if not present or too old, will pull the repo based on the input path argument and update the timestamp. """ # This is run during handler initialization and not for each file validation # Pull repo if fetched more than one day ago # or if never fetched before if os.path.exists(UPDATE_TIMESTAMP): mtime = os.path.getmtime(UPDATE_TIMESTAMP) now = time() if now - mtime > (86400.0): pull_cmor_repo = True else: pull_cmor_repo = False else: pull_cmor_repo = True if pull_cmor_repo: try: # Go into CMOR table path # Git fetch CMOR table repo # Go back to previous working directory checkedRun(('cd {} && git fetch --quiet').format(cmor_table_path)) # Update local timestamp f = open(UPDATE_TIMESTAMP, "w") f.write("CMOR table updated at {}".format(time())) f.close() debug("Local CMOR table repository fetched or updated") except Exception as e: warning( "Attempt to update the cmor table repo and encountered an error: " + str(e)) # Change repo branch in any case try: # Go into CMOR table path # Stash any changes from previous checkout # Checkout to the appropriate CMOR table tag # Go back to previous working directory checkedRun( ('cd {} && git stash --quiet && git checkout {} --quiet').format( cmor_table_path, ds_version)) # Update local timestamp f = open(UPDATE_TIMESTAMP, "w") f.write("CMOR table updated at {}".format(time())) f.close() debug("Consider CMOR table tag: {}".format(ds_version)) except Exception as e: raise ESGPublishError( "Error data_specs_version tag %s not found in the CMOR tables or other error. Please contact support" % ds_version) # Get most up to date CMIP6_CV in any case if ds_version != "master": try: # Go into CMOR table path # PrePARE requires to have the most up to date CMIP6 CV. # Update CMIP6_CV.json from master branch. # Go back to previous working directory checkedRun(('cd {} && git checkout master CMIP6_CV.json --quiet' ).format(cmor_table_path)) debug("CMIP6 CV updated from master") except Exception as e: raise ESGPublishError( "Master branch does not exists or CMIP6_CV.json not found or other error. Please contact support" % ds_version)
def readContext(self, cdfile, model=""): "Get a dictionary of keys from an open file" result = BasicHandler.readContext(self, cdfile) f = cdfile.file for key, value in cmorAttributes.items(): try: result[key] = getattr(f, value) if key in cmorArrayAttributes and type(result[key]) is numpy.ndarray: res = str(result[key][0]) if key == "run_name": if res[0:3] != "run": res = "run" + res result[key] = res except: pass if "realization" in result and "initialization_method" in result and "physics_version" in result: ensemble = "r%si%sp%s" % (result["realization"], result["initialization_method"], result["physics_version"]) result["ensemble"] = ensemble result["run_name"] = ensemble base = os.path.basename(cdfile.path) try: index = base.index("_") varname = base[0:index] result["variable"] = varname except: warning("File path must have the form varname_XXX: %s" % cdfile.path) #!WARNING: All IPSL-LUCID data goes into output result["product"] = "output" self.mapEnumeratedValues(result) # If realm has multiple fields, pick the first one if "realm" in result: realm = result["realm"].strip() if realm.find(" ") != -1: realms = realm.split(" ") result["realm"] = realms[0] # Parse CMOR table. if "table_id" in result: tableId = result["table_id"] fields = tableId.split() # Assume table ID has the form 'Table table_id ...' if len(fields) > 1 and (fields[1] in cmorTables): table = fields[1] result["cmor_table"] = table else: result["cmor_table"] = "noTable" else: result["cmor_table"] = "noTable" # Cache a 'drs_id' attribute for DRS-style dataset lookups validateDRSFieldValues(result, cdfile) if ( "product" in result and "institute" in result and "model" in result and "experiment" in result and "time_frequency" in result and "realm" in result and "cmor_table" in result and "ensemble" in result ): drsid = "%s.%s.%s.%s.%s.%s.%s.%s.%s" % ( DRS_ACTIVITY, result["product"], result["institute"], result["model"], result["experiment"], result["time_frequency"], result["realm"], result["cmor_table"], result["ensemble"], ) result["drs_id"] = drsid return result
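# Sketch of how the ensemble member and DRS id are assembled from CMOR global
# attributes in readContext().  The activity name defaults to 'cmip5' here only
# for illustration (the real code uses DRS_ACTIVITY), and the attribute values
# in the usage note are invented.
def _build_drs_id(ctx, activity='cmip5'):
    ensemble = 'r%si%sp%s' % (ctx['realization'],
                              ctx['initialization_method'],
                              ctx['physics_version'])
    return '%s.%s.%s.%s.%s.%s.%s.%s.%s' % (
        activity, ctx['product'], ctx['institute'], ctx['model'],
        ctx['experiment'], ctx['time_frequency'], ctx['realm'],
        ctx['cmor_table'], ensemble)

# _build_drs_id({'realization': 1, 'initialization_method': 1, 'physics_version': 1,
#                'product': 'output', 'institute': 'IPSL', 'model': 'IPSL-CM5A-LR',
#                'experiment': 'historical', 'time_frequency': 'mon',
#                'realm': 'atmos', 'cmor_table': 'Amon'})
#   -> 'cmip5.output.IPSL.IPSL-CM5A-LR.historical.mon.atmos.Amon.r1i1p1'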
def main(argv): try: args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\ 'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\ 'service=', 'use-version-dir', 'version=']) except getopt.error: print sys.exc_value return if len(lastargs)==0: print 'No directory specified' return appendMap = None datasetName = None datasetTechNotesURL = None datasetTechNotesTitle = None filefilt = '.*\.nc$' init_file = None offline = False output = sys.stdout projectName = None properties = {} readFiles = False service = None max_threads = 4 version_dir = False use_version = None for flag, arg in args: if flag=='-a': if os.path.exists(arg): appendMap = readDatasetMap(arg) else: appendMap = {} output = open(arg, 'a') elif flag=='--dataset': datasetName = arg elif flag=='--dataset-tech-notes': datasetTechNotesURL = arg elif flag=='--dataset-tech-notes-title': datasetTechNotesTitle = arg elif flag=='--filter': filefilt = arg elif flag in ['-h', '--help']: print usage sys.exit(0) elif flag=='-i': init_file = arg elif flag=='--max-threads': max_threads = int(arg) elif flag in ['-o', '--output']: output = open(arg, 'w') elif flag=='--offline': offline = True elif flag=='--project': projectName = arg elif flag in ['-p', '--property']: name, value = arg.split('=') properties[name] = value elif flag in ['-e', '--read-directories']: readFiles = False elif flag=='--read-files': readFiles = True elif flag=='--service': service = arg elif flag=='--use-version-dir': version_dir = True elif flag=='--version': version_dir = True if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD' use_version = arg[1:] else: use_version = arg # Load the configuration and set up a database connection config = loadConfig(init_file) engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600) initLogging('extract', override_sa=engine) Session = sessionmaker(bind=engine, autoflush=True, autocommit=False) # Register project handlers registerHandlers() if not offline: # Determine if checksumming is enabled line = config.get('DEFAULT', 'checksum', default=None) if line is not None: checksumClient, checksumType = splitLine(line) else: checksumClient = None if projectName is not None: handler = getHandlerByName(projectName, None, Session) else: warning("No project name specified!") multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt) firstFile, size = multiIter.next() handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError("No project found in file %s, specify with --project."%firstFile) projectName = handler.name if not readFiles: datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir) else: datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName) # Output the map keys = datasetMap.keys() keys.sort() datasetMapVersion = {} if version_dir: # check for version directory for dataset_id in keys: ds_id_version = dataset_id.split('#') if len(ds_id_version) == 2: ds_id, ds_version = ds_id_version if not re.match('^[0-9]+$', ds_version): warning("Version must be an integer. 
Skipping version %s of dataset %s."%(ds_version, ds_id)) continue if use_version and ds_version != use_version: continue if ds_id in datasetMapVersion: datasetMapVersion[ds_id].append(ds_version) else: datasetMapVersion[ds_id] = [ds_version] else: error("No version directory found. Skipping dataset %s."%dataset_id) if datasetMapVersion: keys = datasetMapVersion.keys() keys.sort() else: if use_version: error("Version %s not found. No datasets to process."%use_version) else: error("No datasets to process.") return for dataset_id in keys: skip_dataset = False dataset_id_version = dataset_id path_version = None # if multiple versions of the same dataset available use latest version if version_dir: path_version = sorted(datasetMapVersion[dataset_id])[-1] if len(datasetMapVersion[dataset_id]) > 1: info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version)) dataset_id_version = '%s#%s'%(dataset_id, path_version) direcTuple = datasetMap[dataset_id_version] direcTuple.sort() mapfile_md = {} for nodepath, filepath in direcTuple: # If readFiles is not set, generate a map entry for each file in the directory # that matches filefilt ... if not readFiles: itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False) # ... otherwise if readFiles is set, generate a map entry for each file else: itr = fnIterator([filepath]) for filepath, sizet in itr: size, mtime = sizet mapfile_md[filepath] = [size] mapfile_md[filepath].append("mod_time=%f"%float(mtime)) extraStuff = "mod_time=%f"%float(mtime) if datasetTechNotesURL is not None: mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL) if datasetTechNotesURL is not None: mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle) if checksumClient is not None: pool = ThreadPool(processes=max_threads) args = [(filepath, checksumClient) for filepath in mapfile_md] checksum_list = pool.map(calc_checksum_wrapper, args) for entry in checksum_list: if not entry[1]: error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id)) skip_dataset = True # skip entire dataset if we have one file without checksum break mapfile_md[entry[0]].append('checksum=%s'%entry[1]) mapfile_md[entry[0]].append('checksum_type=%s'%checksumType) for fpath in mapfile_md: mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0]) for md in mapfile_md[fpath][1:]: mapfile_line+=' | %s'%md # Print the map entry if: # - Checksum exists for all files of dataset (in case checksumming is enabled) # - The map is being created, not appended, or # - The existing map does not have the dataset, or # - The existing map has the dataset, but not the file. 
if path_version: ds_id = (dataset_id, int(path_version)) else: ds_id = (dataset_id, -1) if not skip_dataset and ( (appendMap is None) or (not appendMap.has_key(ds_id)) or (( fpath, "%d"% mapfile_md[fpath][1]) not in appendMap[ds_id]) ): print >>output, mapfile_line else: # offline if projectName is not None: handler = getHandlerByName(projectName, None, Session, offline=True) else: raise ESGPublishError("Must specify --project for offline datasets.") listerSection = getOfflineLister(config, "project:%s"%projectName, service) offlineLister = config.get(listerSection, 'offline_lister_executable') commandArgs = "--config-section %s "%listerSection commandArgs += " ".join(lastargs) for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True): size, mtime = sizet extrastuff = "" if mtime is not None: extrastuff = "| mod_time=%f"%float(mtime) if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]): print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff) if output is not sys.stdout: output.close()
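# Hedged sketch of a single dataset-map line as emitted above:
# dataset_id[#version] | path | size [| key=value ...].  The metadata keys in the
# usage note are the ones handled in this function; the values are invented.
def _mapfile_line(dataset_id_version, path, size, extra_fields=()):
    line = '%s | %s | %d' % (dataset_id_version, path, size)
    for field in extra_fields:
        line += ' | %s' % field
    return line

# _mapfile_line('cmip5.output1.MODEL.expt#20120101', '/data/tas.nc', 1024,
#               ['mod_time=1355875200.000000', 'checksum=abc123', 'checksum_type=SHA256'])
#   -> 'cmip5.output1.MODEL.expt#20120101 | /data/tas.nc | 1024 | mod_time=1355875200.000000 | checksum=abc123 | checksum_type=SHA256'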
'cf-standard-name-table.xml') except: raise ESGPublishError("No standard name table specified.") try: tree = parse(path) except Exception, e: raise ESGPublishError("Error parsing %s: %s" % (path, e)) root = tree.getroot() standardNames = {} for node in root: if node.tag == 'entry': name = node.attrib['id'].strip() if len(name) > MAX_STANDARD_NAME_LENGTH: warning( "Standard_name is too long. Schema requires standard_name to be <= %d characters\n %s" % (MAX_STANDARD_NAME_LENGTH, name)) continue units = amip = grib = description = '' for subnode in node: if subnode.tag == 'canonical_units': units = subnode.text.strip() elif subnode.tag == 'amip': amip = subnode.text elif subnode.tag == 'grib': grib = subnode.text elif subnode.tag == 'description': description = subnode.text else: raise ESGPublishError(
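# Self-contained sketch of the parsing loop above, run against an inline fragment
# shaped like the CF standard name table (entry elements carrying an 'id'
# attribute and canonical_units/amip/grib/description children).  The sample XML
# and helper name are assumptions for illustration only.
from xml.etree.ElementTree import fromstring

_SAMPLE_TABLE = """<standard_name_table>
  <entry id="air_temperature">
    <canonical_units>K</canonical_units>
    <amip>ta</amip>
    <grib>11</grib>
    <description>Bulk temperature of the air.</description>
  </entry>
</standard_name_table>"""

def _parse_standard_names(xml_text):
    names = {}
    for node in fromstring(xml_text):
        if node.tag != 'entry':
            continue
        name = node.attrib['id'].strip()
        fields = dict((child.tag, (child.text or '').strip()) for child in node)
        names[name] = (fields.get('canonical_units', ''),
                       fields.get('amip', ''),
                       fields.get('grib', ''),
                       fields.get('description', ''))
    return names

# _parse_standard_names(_SAMPLE_TABLE)['air_temperature'][0] -> 'K'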
def esgpublishWrapper(**kw): from esgcet.query import queryDatasetMap aggregateDimension = kw.get("aggregateDimension", "time") datasetMapfile = kw.get("datasetMapfile", None) datasetName = kw.get("datasetName", None) directoryList = kw.get("directoryList", None) echoSql = kw.get("echoSql", False) filefilt = kw.get("filefilt", ".*\.nc$") init_file = kw.get("init_file", None) initcontext = kw.get("initcontext", {}) keepVersion = kw.get("keepVersion", False) las = kw.get("las", False) log_filename = kw.get("log_filename", None) masterGateway = kw.get("masterGateway", None) message = kw.get("message", None) offline = kw.get("offline", False) parent = kw.get("parent", None) perVariable = kw.get("perVariable", None) projectName = kw.get("projectName", None) properties = kw.get("properties", {}) publish = kw.get("publish", False) publishOnly = kw.get("publishOnly", False) publishOp = kw.get("publishOp", CREATE_OP) readFiles = kw.get("readFiles", False) readFromCatalog = kw.get("readFromCatalog", False) reinitThredds = kw.get("reinitThredds", None) rescan = kw.get("rescan", False) rescanDatasetName = kw.get("rescanDatasetName", []) resultThreddsDictionary = None service = kw.get("service", None) summarizeErrors = kw.get("summarizeErrors", False) testProgress1 = kw.get("testProgress1", None) testProgress2 = kw.get("testProgress2", None) thredds = kw.get("thredds", False) threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None) version = kw.get("version", None) # If offline, the project must be specified if offline and (projectName is None): raise ESGPublishError("Must specify project with --project for offline datasets") # Must specify version for replications if masterGateway is not None and version is None: raise ESGPublishError("Must specify version with --new-version for replicated datasets") # Load the configuration and set up a database connection config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename) # Register project handlers registerHandlers() # If the dataset map is input, just read it ... dmap = None directoryMap = None extraFields = None if datasetMapfile is not None: dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True) datasetNames = dmap.keys() elif rescan: # Note: No need to get the extra fields, such as mod_time, since # they are already in the database, and will be used for file comparison if necessary. dmap, offline = queryDatasetMap(rescanDatasetName, Session) datasetNames = dmap.keys() # ... otherwise generate the directory map. else: # Online dataset(s) if not offline: if projectName is not None: handler = getHandlerByName(projectName, None, Session) else: multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt) firstFile, size = multiIter.next() listIter = list(multiIter) handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError("No project found in file %s, specify with --project." % firstFile) projectName = handler.name props = properties.copy() props.update(initcontext) if not readFiles: directoryMap = handler.generateDirectoryMap( directoryList, filefilt, initContext=props, datasetName=datasetName ) else: directoryMap = handler.generateDirectoryMapFromFiles( directoryList, filefilt, initContext=props, datasetName=datasetName ) datasetNames = [(item, -1) for item in directoryMap.keys()] # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...] 
else: handler = getHandlerByName(projectName, None, Session, offline=True) dmap = {} listerSection = getOfflineLister(config, "project:%s" % projectName, service) offlineLister = config.get(listerSection, "offline_lister_executable") commandArgs = "--config-section %s " % listerSection commandArgs += " ".join(directoryList) for dsetName, filepath, sizet in processNodeMatchIterator( offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True ): size, mtime = sizet if dmap.has_key((dsetName, -1)): dmap[(dsetName, -1)].append((filepath, str(size))) else: dmap[(dsetName, -1)] = [(filepath, str(size))] datasetNames = dmap.keys() datasetNames.sort() if len(datasetNames) == 0: warning("No datasets found.") # Iterate over datasets if not publishOnly: datasets = iterateOverDatasets( projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, ) result = publishDatasetList( datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, threddsCatalogDictionary=threddsCatalogDictionary, reinitThredds=reinitThredds, readFromCatalog=readFromCatalog, ) return result
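For orientation, a minimal sketch of driving the wrapper above from Python rather than the command line. The import path and the scan directory are illustrative assumptions; the keyword names are the ones esgpublishWrapper reads via kw.get().

    # Sketch only: scan one directory tree online and publish the result.
    # The module path below is an assumption; adjust to wherever the wrapper lives.
    from esgcet.publish import esgpublishWrapper

    result = esgpublishWrapper(
        projectName="cmip5",                    # skip per-file project detection
        directoryList=["/data/cmip5/output1"],  # illustrative scan root
        filefilt=r".*\.nc$",                    # same default the wrapper uses
        thredds=True,                           # write THREDDS catalogs
        publish=True,                           # publish to the index/gateway
    )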
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None): """ Aggregate file variables into variables, and add to the database. Populates the database tables: - variable - file_variable - associated attribute tables Returns a Dataset object. datasetName String dataset identifier. dbSession A database Session. aggregateDimensionName The name of the dimension across which the dataset is aggregated, if any. cfHandler A CFHandler to validate standard names, etc. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. stopEvent Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped. datasetInstance Existing dataset instance. If not provided, the instance is regenerated from the database. """ session = dbSession() info("Aggregating variables") # Lookup the dataset if datasetInstance is None: dset = session.query(Dataset).filter_by(name=datasetName).first() for variable in dset.variables: session.delete(variable) for attrname, attr in dset.attributes.items(): if not attr.is_category: del dset.attributes[attrname] session.commit() dset.variables = [] else: dset = datasetInstance # session.save_or_update(dset) session.add(dset) if dset is None: raise ESGPublishError("Dataset not found: %s"%datasetName) dsetindex = {} # dsetindex[varname] = [(variable, domain), (variable, domain), ...] # where domain = ((dim0, len0, 0), (dim1, len1, 1), ...) # Note: # (1) If a dim0 is the aggregate dimension, len0 is 0 # (2) A dsetindex entry will only have multiple tuples if # there are more than one variable with the same name # and different domains. 
varindex = {} # varindex[(varname, domain, attrname)] = attribute globalAttrIndex = {} # globalAttrIndex[attname] = attval, for global attributes dsetvars = [] # list of all target variables of a dataset dset_target_vars = set() # Create variables seq = 0 nfiles = len(dset.getFiles()) for file in dset.getFiles(): for filevar in file.file_variables: if filevar.is_target_variable: dset_target_vars.add(filevar.short_name) # Get the filevar and variable domain fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions) fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ])) filevar.domain = fvdomain if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName: vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length else: vardomain = tuple(fvdomain) # Create the variable if necessary varlist = dsetindex.get(filevar.short_name, None) if varlist is None or vardomain not in [item[1] for item in varlist]: var = Variable(filevar.short_name, filevar.long_name) var.domain = vardomain # Record coordinate variable range if applicable if filevar.coord_type is not None: var.coord_type = filevar.coord_type if var.coord_type=='Z': var.coord_values = filevar.coord_values var.coord_range = filevar.coord_range dsetvars.append(var) if varlist is None: dsetindex[var.short_name] = [(var, vardomain)] else: varlist.append((var, vardomain)) else: for tvar, domain in varlist: if domain==vardomain: var = tvar break # Attach the file variable to the variable var.file_variables.append(filevar) # Create attributes for fvattribute in filevar.attributes: vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None) if vattribute is None: attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length) var.attributes.append(attribute) varindex[(var.short_name, vardomain, attribute.name)] = attribute if attribute.name == 'units': var.units = attribute.value # Create global attributes for fileattr in file.attributes: fattribute = globalAttrIndex.get(fileattr.name, None) if fattribute is None and fileattr.name not in ['readDimension']: attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length) dset.attributes[attribute.name] = attribute globalAttrIndex[attribute.name] = attribute seq += 1 try: issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent) except: session.rollback() session.close() raise # Find the aggregation dimension bounds variable, if any aggDim = lookupVar(aggregateDimensionName, dsetindex) boundsName = lookupAttr(aggDim, 'bounds') aggUnits = lookupAttr(aggDim, 'units') aggDimBounds = lookupVar(boundsName, dsetindex) # Set calendar for time aggregation isTime = cfHandler.axisIsTime(aggDim) if isTime: calendar = cfHandler.getCalendarTag(aggDim) if calendar is None: calendar = "gregorian" else: calendar = None dset.calendar = calendar dset.aggdim_name = aggregateDimensionName dset.aggdim_units = aggUnits cdcalendar = cfHandler.tagToCalendar(calendar) # Add the non-aggregate dimension variables to the dataset for var in dsetvars: if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars: dset.variables.append(var) # Set coordinate ranges for var in dset.variables: for name, length, seq in var.domain: if name==aggregateDimensionName: continue dvar = lookupCoord(name, dsetindex, length) if dvar is not None: units = lookupAttr(dvar, 'units') if units is None: warning("Missing units, 
variable=%s"%dvar.short_name) units = '' if hasattr(dvar, 'coord_type'): if dvar.coord_type=='X': var.eastwest_range = dvar.coord_range+':'+units elif dvar.coord_type=='Y': var.northsouth_range = dvar.coord_range+':'+units elif dvar.coord_type=='Z': var.updown_range = dvar.coord_range+':'+units var.updown_values = dvar.coord_values # Attach aggregate dimension filevars to files if aggDim is not None: for filevar in aggDim.file_variables: filevar.file.aggDim = filevar if aggDimBounds is not None: for filevar in aggDimBounds.file_variables: filevar.file.aggDimBounds = filevar # Combine aggregate dimensions: # Scan all variables with the aggregate dimension in the domain. For each such variable, # create an aggregate dimension variable, and bounds if needed. timevars = [] for var in dset.variables: if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]: aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName) aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName) if aggVar is not None: aggVar.units = aggUnits timevars.append(aggVar) if aggBoundsVar is not None: timevars.append(aggBoundsVar) # Create variable dimensions, aggregating the agg dimension debug("Creating dimensions") i = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain # Increment aggregate dimension length if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]: for filevar in var.file_variables: fvdomain = filevar.domain vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:]) var.domain = vardomain # Create the variable domain for name, length, seq in vardomain: dimension = VariableDimension(name, length, seq) var.dimensions.append(dimension) i += 1 try: issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent) except: session.rollback() session.close() raise # Set variable aggregate dimension ranges debug("Setting aggregate dimension ranges") seq = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName: # Adjust times so they have consistent base units try: filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables] except: for fv in var.file_variables: try: firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar) lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar) except: error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units)) raise mono = cmp(filevarRanges[0][1], filevarRanges[0][2]) if mono<=0: filevarRanges.sort(lambda x, y: cmp(x[1], y[1])) else: filevarRanges.sort(lambda x, y: -cmp(x[1], y[1])) # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated. 
lastValues = numpy.array(map(lambda x: x[2], filevarRanges)) firstValues = numpy.array(map(lambda x: x[1], filevarRanges)) if (var not in [aggDim, aggDimBounds]): if mono<=0: compare = (lastValues[0:-1] >= firstValues[1:]) else: compare = (lastValues[0:-1] <= firstValues[1:]) if compare.any(): overlaps = compare.nonzero()[0] dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True nprint = min(len(overlaps), 3) for i in range(nprint): dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE) dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE) if len(overlaps)>nprint: dset.warning(" ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE) # Check monotonicity of last values. else: if mono<=0: compare = (lastValues[0:-1] < lastValues[1:]).all() else: compare = (lastValues[0:-1] > lastValues[1:]).all() if not compare: dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True var.aggdim_first = float(firstValues[0]) var.aggdim_last = float(lastValues[-1]) seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent) except: session.rollback() session.close() raise # Combine identical aggregate dimensions and add to the dataset timevardict = {} for var in timevars: timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var for var in timevardict.values(): dset.variables.append(var) # Validate standard names seq = 0 nvars = len(dset.variables) for var in dset.variables: attr = lookupAttr(var, 'standard_name') if (attr is not None): if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)): info("Invalid standard name: %s for variable %s"%(attr, var.short_name)) else: var.standard_name = attr seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent) except: session.rollback() session.close() raise debug("Adding variable info to database") session.commit() session.close()
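A sketch of calling aggregateVariables directly, wiring up the progressCallback tuple described in the docstring. The dataset identifier and project name are made up; the setup lines mirror the initdb()/registerHandlers()/getHandlerByName() sequence used in esgpublishWrapper above.

    # Sketch only (Python 2, matching the publisher code base).
    config, Session = initdb(init_file=None)
    registerHandlers()
    handler = getHandlerByName("cmip5", None, Session)       # project name is illustrative

    def show_progress(progress):
        # ``progress`` runs from the initial to the final value given in the tuple
        print "aggregation progress: %s" % progress

    cfHandler = handler.getMetadataHandler(sessionMaker=Session)
    aggregateVariables(
        "cmip5.output1.MYCENTER.MYMODEL.historical.mon.atmos.Amon.r1i1p1",   # made-up id
        Session,
        aggregateDimensionName="time",
        cfHandler=cfHandler,
        progressCallback=(show_progress, 0, 100),
    )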
from pkg_resources import resource_filename path = resource_filename('esgcet.config.etc', 'cf-standard-name-table.xml') except: raise ESGPublishError("No standard name table specified.") try: tree = parse(path) except Exception, e: raise ESGPublishError("Error parsing %s: %s"%(path, e)) root = tree.getroot() standardNames = {} for node in root: if node.tag == 'entry': name = node.attrib['id'].strip() if len(name) > MAX_STANDARD_NAME_LENGTH: warning("Standard_name is too long. Schema requires standard_name to be <= %d characters\n %s"%(MAX_STANDARD_NAME_LENGTH, name)) continue units = amip = grib = description = '' for subnode in node: if subnode.tag == 'canonical_units': units = subnode.text.strip() elif subnode.tag == 'amip': amip = subnode.text elif subnode.tag == 'grib': grib = subnode.text elif subnode.tag == 'description': description = subnode.text else: raise ESGPublishError("Invalid standard name table tag: %s"%subnode.tag)
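For reference, a self-contained sketch of the XML shape the loop above consumes. The sample entry is hypothetical but uses only the tags the parser looks at (entry/@id, canonical_units, amip, grib, description).

    from xml.etree.ElementTree import fromstring

    sample = """
    <standard_name_table>
      <entry id="air_temperature">
        <canonical_units>K</canonical_units>
        <amip>ta</amip>
        <grib>11</grib>
        <description>Bulk temperature of the air.</description>
      </entry>
    </standard_name_table>
    """

    root = fromstring(sample)
    for node in root:
        if node.tag == 'entry':
            print node.attrib['id'].strip()              # air_temperature
            for subnode in node:
                if subnode.tag == 'canonical_units':
                    print subnode.text.strip()           # K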
def readContext(self, cdfile, model=''): "Get a dictionary of keys from an open file" result = BasicHandler.readContext(self, cdfile) f = cdfile.file for key, value in cmorAttributes.items(): try: result[key] = getattr(f, value) if key in cmorArrayAttributes and type(result[key]) is numpy.ndarray: res = str(result[key][0]) if key=='run_name': if res[0:3]!='run': res = 'run'+res result[key] = res except: pass if 'realization' in result and 'initialization_method' in result and 'physics_version' in result: ensemble = 'r%si%sp%s'%(result['realization'], result['initialization_method'], result['physics_version']) result['ensemble'] = ensemble result['run_name'] = ensemble base = os.path.basename(cdfile.path) try: index = base.index('_') varname = base[0:index] result['variable'] = varname except: warning("File path must have the form varname_XXX: %s"%cdfile.path) #!WARNING: I think all TAMIP2 data goes into output1 result['product'] = 'output1' self.mapEnumeratedValues(result) # If realm has multiple fields, pick the first one if 'realm' in result: realm = result['realm'].strip() if realm.find(' ')!=-1: realms = realm.split(' ') result['realm'] = realms[0] # Parse CMOR table. if 'table_id' in result: tableId = result['table_id'] fields = tableId.split() # Assume table ID has the form 'Table table_id ...' if len(fields)>1 and (fields[1] in cmorTables): table = fields[1] result['cmor_table'] = table else: result['cmor_table'] = 'noTable' else: result['cmor_table'] = 'noTable' # Cache a 'drs_id' attribute for DRS-style dataset lookups validateDRSFieldValues(result, cdfile) if 'product' in result and 'institute' in result and 'model' in result and 'experiment' in result and 'time_frequency' in result and 'realm' in result and 'cmor_table' in result and 'ensemble' in result: drsid = '%s.%s.%s.%s.%s.%s.%s.%s.%s'%(DRS_ACTIVITY, result['product'], result['institute'], result['model'], result['experiment'], result['time_frequency'], result['realm'], result['cmor_table'], result['ensemble']) result['drs_id'] = drsid return result
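A worked example of the strings readContext assembles, using made-up attribute values; DRS_ACTIVITY is a module-level constant in the real handler, so the placeholder below is only for illustration.

    DRS_ACTIVITY = 'tamip'      # placeholder; the handler module defines the real constant

    result = {'realization': '1', 'initialization_method': '1', 'physics_version': '2',
              'product': 'output1', 'institute': 'MYCENTER', 'model': 'MYMODEL',
              'experiment': 'tamip200901', 'time_frequency': '3hr', 'realm': 'atmos',
              'cmor_table': '3hrSlev'}

    ensemble = 'r%si%sp%s' % (result['realization'],
                              result['initialization_method'],
                              result['physics_version'])
    # ensemble == 'r1i1p2'

    drs_id = '%s.%s.%s.%s.%s.%s.%s.%s.%s' % (
        DRS_ACTIVITY, result['product'], result['institute'], result['model'],
        result['experiment'], result['time_frequency'], result['realm'],
        result['cmor_table'], ensemble)
    # drs_id == 'tamip.output1.MYCENTER.MYMODEL.tamip200901.3hr.atmos.3hrSlev.r1i1p2'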
def evt_remove_dataset(self, parent): from esgcet.publish import pollDatasetPublicationStatus # Start the busy routine to indicate to the users something is happening parent.busyCursor = 'watch' parent.busyWidgets = [ parent.pane2.pane('EditPaneTop'), parent.pane2.pane('EditPaneBottom'), parent.pane2.pane('EditPaneStatus'), parent.pane.pane('ControlPane') ] pub_busy.busyStart(parent) datasetNames = [] GUI_line = {} DELETE = 1 #UNPUBLISH = 2 NO_OPERATION = 3 DeleteLocalDB = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox1( ) # DeleteLocalDB DeleteGateway = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox2( ) # DeleteGateway DeleteThredds = pub_expand_deletion_control_gui.deletion_widgets.get_CheckBox3( ) # DeleteThredds selected_page = parent.main_frame.selected_top_page if selected_page is not None: tab_name = parent.top_notebook.getcurselection() for x in parent.main_frame.top_page_id[selected_page]: if parent.main_frame.top_page_id[selected_page][x].cget( 'bg') != 'salmon' and parent.main_frame.top_page_id2[ selected_page][x].cget('bg') != 'salmon': dset_name = parent.main_frame.top_page_id2[selected_page][ x].cget('text') #dsetVersionName1 = self.parent.parent.main_frame.top_page_id2v[selected_page][x].cget('text') #query_name, dset_version = parseDatasetVersionId(dsetVersionName1) """ ganz I am modifying this so that if a user selects a dataset without a version then we delete all versions of that dataset""" try: dset_version = parent.main_frame.version_label[ selected_page][x].cget('text') except: dset_version = -1 #print 'Delete all versions' #dset_version = 1 if (dset_version == 'N/A' or not dset_version): dset_version = -1 # continue # not published, yet # Only delete published events status = pollDatasetPublicationStatus( dset_name, self.Session) if status == 3 or DeleteGateway or DeleteThredds or DeleteLocalDB: #datasetNames.append(generateDatasetVersionId((dset_name, dset_version))) datasetNames.append([ dset_name, dset_version ]) # ganz create name/version to delete else: parent.main_frame.top_page_id[selected_page][ x].configure(relief='raised', background='salmon', image=self.off) GUI_line[dset_name] = x else: if parent.main_frame.top_page_id2[selected_page][x].cget( 'bg') == 'salmon': parent.main_frame.top_page_id[selected_page][ x].configure(relief='raised', background='salmon', image=self.off) else: warning( "%d: No pages generated for selection. Remove is only used to remove datasets from the Publisher." % logging.WARNING) # Remove dataset from the gateway, etc. if ((DeleteGateway == 0 or DeleteThredds == 0) and DeleteLocalDB == 1): ans = self.warn_On_Removal() if (ans == FALSE): return if DeleteGateway == 1: gatewayOp = DELETE else: gatewayOp = NO_OPERATION # now decide if there is anything to do if (gatewayOp == 1 or DeleteThredds == 1 or DeleteLocalDB == 1): las = False thredds = (DeleteThredds == 1) deleteDset = (DeleteLocalDB == 1) testProgress = (parent.parent.statusbar.show, 0, 100) status_dict = deleteDatasetList(datasetNames, self.Session, gatewayOp, thredds, las, deleteDset, progressCallback=testProgress) # Show the published status try: for x in status_dict.keys(): status = status_dict[x] parent.main_frame.status_label[selected_page][ GUI_line[x]].configure( text=pub_controls.return_status_text(status)) except: pass pub_busy.busyEnd(parent) # ganz refresh [if there were no exceptions] dataset list after deletions parent.pub_buttonexpansion.query_widgets.parent.parent.ntk.evt_refresh_list_of_datasets( selected_page)
def __invoke(self, method, params): # call a method on the remote server request = HessianWriter().write_call(method, params) # ---------------------------------------------------------------------- # Patch for HTTP proxy support starts here. [email protected] # import httplib, os, urlparse, ssl if self._scheme == "http": proxy_url = os.environ.get('http_proxy') if proxy_url is not None: if DEBUG: messaging.info('Proxy detected at %s' % proxy_url) proxy_parts = urlparse.urlparse(proxy_url) proxy_host = proxy_parts.hostname proxy_port = proxy_parts.port if proxy_port is None: proxy_port = 80 h = httplib.HTTPConnection(proxy_host, port=proxy_port) else: h = httplib.HTTPConnection(self._host, port=self._port) else: ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1) conn_args = {'port' : self._port, 'key_file' : self._key_file, 'cert_file': self._cert_file, 'context': ctx} h = httplib.HTTPSConnection(self._host, **conn_args) # test the connection - may need unverified with test index nodes # (hopefully not with operational nodes) try: h.request("HEAD", "/") h.getresponse() except ssl.SSLError: messaging.warning('SSL error - disabling SSL verification') conn_args['context'] = ssl._create_unverified_context() h = httplib.HTTPSConnection(self._host, **conn_args) req_headers = {'Host': self._host, 'User-Agent': "hessianlib.py/%s" % __version__, 'Content-Length': str(len(request)), } if DEBUG: messaging.info('Sending request: %s' % `request`) h.request("POST", self._url, request, req_headers) # # End Patch from [email protected] # ---------------------------------------------------------------------- response = h.getresponse() headers = response.getheaders() errcode = response.status errmsg = response.reason # errcode, errmsg, headers = h.getreply() if errcode != 200: raise ProtocolError(self._url, errcode, errmsg, headers) # return self.parse_response(h.getfile()) if DEBUG: messaging.info('Got response:') responseProxy = ResponseProxy(response) return self.parse_response(responseProxy)
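The proxy patch above boils down to the selection logic sketched below (Python 2 stdlib modules, as used in this file); the proxy URL and host names are made up.

    import os, urlparse, httplib

    os.environ['http_proxy'] = 'http://proxy.example.org:3128'    # illustrative proxy

    proxy_url = os.environ.get('http_proxy')
    if proxy_url is not None:
        parts = urlparse.urlparse(proxy_url)
        host, port = parts.hostname, parts.port or 80              # connect to the proxy
    else:
        host, port = 'esgf-index.example.org', 80                  # connect directly
    conn = httplib.HTTPConnection(host, port=port)
    # When a proxy is used, the request must carry the absolute URL, which is what
    # self._url holds in __invoke(); e.g. conn.request("POST", full_url, body, headers).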
def readContext(self, cdfile, model=''): "Get a dictionary of keys from an open file" result = BasicHandler.readContext(self, cdfile) f = cdfile.file for key, value in cmorAttributes.items(): try: result[key] = getattr(f, value) if key in cmorArrayAttributes and type( result[key]) is numpy.ndarray: res = str(result[key][0]) if key == 'run_name': if res[0:3] != 'run': res = 'run' + res result[key] = res except: pass if 'realization' in result and 'initialization_method' in result and 'physics_version' in result: ensemble = 'r%si%sp%s' % (result['realization'], result['initialization_method'], result['physics_version']) result['ensemble'] = ensemble result['run_name'] = ensemble base = os.path.basename(cdfile.path) try: index = base.index('_') varname = base[0:index] result['variable'] = varname except: warning("File path must have the form varname_XXX: %s" % cdfile.path) #!WARNING: I think all TAMIP2 data goes into output1 result['product'] = 'output1' self.mapEnumeratedValues(result) # If realm has multiple fields, pick the first one if 'realm' in result: realm = result['realm'].strip() if realm.find(' ') != -1: realms = realm.split(' ') result['realm'] = realms[0] # Parse CMOR table. if 'table_id' in result: tableId = result['table_id'] fields = tableId.split() # Assume table ID has the form 'Table table_id ...' if len(fields) > 1 and (fields[1] in cmorTables): table = fields[1] result['cmor_table'] = table else: result['cmor_table'] = 'noTable' else: result['cmor_table'] = 'noTable' # Cache a 'drs_id' attribute for DRS-style dataset lookups validateDRSFieldValues(result, cdfile) if 'product' in result and 'institute' in result and 'model' in result and 'experiment' in result and 'time_frequency' in result and 'realm' in result and 'cmor_table' in result and 'ensemble' in result: drsid = '%s.%s.%s.%s.%s.%s.%s.%s.%s' % ( DRS_ACTIVITY, result['product'], result['institute'], result['model'], result['experiment'], result['time_frequency'], result['realm'], result['cmor_table'], result['ensemble']) result['drs_id'] = drsid return result
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, perVariable=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False, nodbwrite=False, pid_connector=None): """ Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``). All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui]. Returns a list of persistent Dataset instances. projectName String name of the project associated with the datasets. If None, it is determined by the first handler found that can open a sample file from the dataset. dmap A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified. directoryMap A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``. datasetNames A list of dataset names identifying the datasets to be scanned. Session An SQLAlchemy Session. aggregateDimension Name of the dimension on which to aggregate the datasets. operation The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP filefilt String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored. initcontext Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles. Contrast with ``properties``. offlineArg Boolean flag or dictionary If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated metadata will be a minimal set including file name and size. If a dictionary, maps dataset_name => offline flag properties Dictionary of property/value pairs. The properties must be configured in the initialization file section corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``. testProgress1=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the scan phase. testProgress2=None Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*, ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to the aggregation phase. handlerDictionary=None A dictionary mapping datasetName => handler. If None, handlers are determined by project name. perVariable=None Boolean, overrides ``variable_per_file`` config option. keepVersion Boolean, True if the dataset version should not be incremented. newVersion Integer or dictionary. Set the new version number explicitly. If a dictionary, maps dataset_id => version. By default the version number is incremented by 1. See keepVersion. extraFields Extra dataset map fields, as from **readDatasetMap**. masterGateway The gateway that owns the master copy of the datasets. If None, the dataset is not replicated. Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated. 
comment=None String comment to associate with new datasets created. forceAggregate=False If True, run the aggregation step regardless. readFiles=False If True, interpret directoryMap as having one entry per file, instead of one per directory. pid_connector esgfpid.Connector object to register PIDs """ from esgcet.publish import extractFromDataset, aggregateVariables versionIsMap = (type(newVersion) is types.DictType) if versionIsMap: saveVersionMap = newVersion prevProject = None datasets = [] ct = len(datasetNames) for iloop in range(ct): datasetName, versionno = datasetNames[iloop] # Must specify version for replications if masterGateway: if not newVersion and versionno < 0: raise ESGPublishError( "Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list." ) # If using a version map, lookup the version for this dataset if versionIsMap: try: newVersion = saveVersionMap[datasetName] except KeyError: raise ESGPublishError("Dataset not found in version map: %s" % datasetName) context = initcontext.copy() # Get offline flag if type(offlineArg) is dict: offline = offlineArg[datasetName] else: offline = offlineArg # Don't try to aggregate offline datasets if offline: forceAggregate = False # Get a file iterator and sample file if dmap is not None: if len(dmap[(datasetName, versionno)]) == 0: warning("No files specified for dataset %s, version %d." % (datasetName, versionno)) continue firstFile = dmap[(datasetName, versionno)][0][0] fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg) else: direcTuples = directoryMap[datasetName] firstDirec, sampleFile = direcTuples[0] firstFile = os.path.join(firstDirec, sampleFile) if not readFiles: fileiter = multiDirectoryIterator( [direc for direc, sampfile in direcTuples], filefilt) else: fileiter = fnIterator( [sampfile for direc, sampfile in direcTuples]) # If the project is not specified, try to read it from the first file if handlerDictionary is not None and handlerDictionary.has_key( datasetName): handler = handlerDictionary[datasetName] elif projectName is not None: handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline) else: handler = getHandler(firstFile, Session, validate=True) if handler is None: raise ESGPublishError( "No project found in file %s, specify with --project." % firstFile) projectName = handler.name info("Using project name = %s" % projectName) if prevProject is not None and projectName != prevProject: raise ESGPublishError( "Multiple projects found: %s, %s. 
Can only publish from one project" % (prevProject, projectName)) prevProject = projectName # Generate the initial context from the dataset name context = handler.parseDatasetName(datasetName, context) # Load the rest of the context from the first file, if possible context = handler.getContext(**context) # Add properties from the command line fieldNames = handler.getFieldNames() for name, value in properties.items(): if name not in fieldNames: warning('Property not configured: %s, was ignored' % name) else: context[name] = value # add dataset_version to context to allow version to be a mandatory field if versionno > -1: context['dataset_version'] = versionno elif newVersion is not None: context['dataset_version'] = newVersion # Update the handler context and fill in default values handler.updateContext(context, True) # Ensure that fields are valid: try: handler.validateContext(context) except ESGInvalidMandatoryField: if offline: error("Dataset id has a missing or invalid mandatory field") raise # Create a CFHandler for validation of standard names, checking time axes, etc. cfHandler = handler.getMetadataHandler(sessionMaker=Session) dataset = None if testProgress1 is not None: testProgress1[1] = (100. / ct) * iloop if not offline: testProgress1[2] = (100. / ct) * iloop + (50. / ct) else: testProgress1[2] = (100. / ct) * iloop + (100. / ct) dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, perVariable=perVariable, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, nodbwrite=nodbwrite, pid_connector=pid_connector, **context) # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset. runAggregate = (not offline) if hasattr(dataset, 'reaggregate'): runAggregate = (runAggregate and dataset.reaggregate) runAggregate = runAggregate or forceAggregate if testProgress2 is not None: testProgress2[1] = (100. / ct) * iloop + 50. / ct testProgress2[2] = (100. / ct) * (iloop + 1) if runAggregate and (not nodbwrite): aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset) elif testProgress2 is not None: # Just finish the progress GUI issueCallback(testProgress2, 1, 1, 0.0, 1.0) # Save the context with the dataset, so that it can be searched later if (not nodbwrite): handler.saveContext(datasetName, Session) datasets.append(dataset) return datasets
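A sketch of the scan-then-publish sequence that esgpublishWrapper performs with this helper, using a hypothetical mapfile path. readDatasetMap, CREATE_OP and publishDatasetList are the same names used in the wrapper above; Session comes from initdb() as shown there.

    # Sketch only: scan/aggregate everything in a mapfile, then publish.
    dmap, extraFields = readDatasetMap("/esg/mapfiles/cmip5.map", parse_extra_fields=True)
    datasetNames = dmap.keys()

    datasets = iterateOverDatasets(
        None,                 # project determined from the first file
        dmap, None, datasetNames, Session,
        "time",               # aggregateDimension
        CREATE_OP,            # operation
        r".*\.nc$",           # filefilt (ignored when dmap is given)
        {},                   # initcontext
        False,                # offlineArg
        {},                   # properties
        extraFields=extraFields,
    )
    status = publishDatasetList(datasetNames, Session, publish=True, thredds=True)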
def datasetOrVersionName(name, version, session, deleteAll=False, restInterface=False): """ Determine if the name refers to a dataset or dataset version. Returns (deleteAll, datasetObj, [versionObjs], isLatestVersion) where: datasetObj is the related dataset object, or None if neither the dataset nor the version is found; [versionObjs] is a list of version objects to be deleted. isLatestVersion is True iff this version is the latest one for the dataset. It is not considered an error if the version does not exist in the local database, since it may still exist in THREDDS and/or the gateway. name String name to look up. version Version to delete. If version is -1, all version objects for the dataset are returned. session A database Session **instance**. deleteAll Boolean, if True delete all versions of the dataset(s). restInterface Boolean, if True then name has the form 'master_id.version|data_node'. """ # Parse a SOLR dataset ID if the RESTful interface is used if restInterface: saveName = name name, version, data_node = parseSolrDatasetId(name) if data_node is None: warning("Dataset: %s, REST interface dataset identifiers should have the form dataset_id|data_node"%saveName) # Lookup the dataset dset = session.query(Dataset).filter_by(name=name).first() deleteAll = (deleteAll or version==-1) isLatest = False if dset is None: dsetVersionObjs = [] else: # It's a dataset # Check if this is the latest version versionObj = dset.getVersionObj(version=version) if versionObj is None: warning("Version %d of dataset %s not found"%(version, dset.name)) isLatest = False else: isLatest = versionObj.isLatest() # If this is the only version, delete the entire dataset deleteAll = deleteAll or (versionObj is not None and len(dset.versions)==1) if deleteAll: dsetVersionObjs = dset.versions else: if versionObj is None: dsetVersionObjs = [] else: dsetVersionObjs = [versionObj] return (deleteAll, dset, dsetVersionObjs, isLatest)
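A sketch of interpreting the tuple returned above; the dataset identifier and version are made up, and Session is the usual session factory from initdb().

    session = Session()
    deleteAll, dset, versionObjs, isLatest = datasetOrVersionName(
        "cmip5.output1.MYCENTER.MYMODEL.historical.mon.atmos.Amon.r1i1p1",   # made-up id
        2, session)

    if dset is None:
        print "not in the node database (may still exist in THREDDS or on the gateway)"
    elif deleteAll:
        print "removing all %d version(s) of %s" % (len(versionObjs), dset.name)
    else:
        print "removing %d version object(s) of %s" % (len(versionObjs), dset.name)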
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of (dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server. las Boolean flag: if true (default is False), reinitialize the LAS server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. reinitThredds Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server. restInterface Boolean flag. If True, delete/retract datasets with the RESTful publication services. pid_connector esgfpid.Connector object to register PIDs project_config_section Name of the project config section in esg.ini (for user specific project configs) data_node String, the datanode to unpublish (only for unpublication from Solr) """ if gatewayOperation == UNINITIALIZED: raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!") if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation) deleteOnGateway = (gatewayOperation==DELETE) operation = (gatewayOperation!=NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s"%datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL(project_config_section=project_config_section) servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: service_certs_location = getServiceCertsLoc() serviceURL = getRestServiceURL(project_config_section=project_config_section) serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, service_certs_location, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName,version in datasetNames: if version > -1: datasetToUnpublish = '%s.v%s' % (datasetName, version) else: if service.service_type == 'REST': error('Cannot unpublish multiple versions using REST. Please specify a single dataset version ("dataset_id#1"). Skipping %s' % datasetName) continue datasetToUnpublish = datasetName isDataset, dset, versionObjs, isLatest = nameDict[datasetName] try: eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n'))) continue info(" Result: %s"%stateName) resultDict[datasetName] = eventName
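A sketch of a typical retraction call against the function above; the dataset identifier and version are made up, and the keyword values simply exercise the documented flags. Session comes from initdb() as elsewhere in this section.

    status = deleteDatasetList(
        [("cmip5.output1.MYCENTER.MYMODEL.historical.mon.atmos.Amon.r1i1p1", 2)],
        Session,
        gatewayOperation=UNPUBLISH,    # retract (hide) rather than fully delete
        thredds=True,                  # also remove the THREDDS catalog
        deleteInDatabase=False,        # keep the local node database entry
    )
    for dataset_id, event in status.items():
        info("%s -> %s" % (dataset_id, event))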
def parseDatasetName(self, datasetName, context): """Parse a dataset name. Returns a dictionary, mapping field => value. The config file option 'dataset_id' is used to parse the name into fields. datasetName String dataset identifier. context Initial context dictionary. This argument is altered on output. """ config = getConfig() section = 'project:'+self.name datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None) if datasetIdFormatList is None: # warning("No dataset_id option found for project %s"%self.name) return context datasetIdFormats = splitLine(datasetIdFormatList) formatMatched = False for idFormat in datasetIdFormats: # '.' => '\.' newinit = re.sub(r'\.', r'\.', idFormat.strip()) # %(name)s => (?P<name>[^.]*) newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit) # If experiment is enumerated, match on the experiment options. This allows # experiment ids to contain periods (.) . experimentOptions = self.getFieldOptions('experiment') # Map to case-sensitive options experimentOptions = self.mapValidFieldOptions('experiment', experimentOptions) if idFormat.find('%(experiment)s')!=-1 and experimentOptions is not None: if len(experimentOptions) > 0: optionOr = reduce(lambda x,y: x+'|'+y, experimentOptions) experimentPattern = r'(?P<experiment>%s)'%optionOr newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern) if newinit[-1]!='$': newinit += '$' match = re.match(newinit, datasetName) if match is None: continue else: result = match.groupdict() formatMatched = True for key, value in result.items(): if context.has_key(key) and value!=context[key]: warning("Dataset ID=%s, but %s=%s"%(datasetName, key, context[key])) else: context[str(key)] = value break if not formatMatched: warning("Dataset ID: %s does not match the dataset_id format(s): %s"%(datasetName, `datasetIdFormats`)) return context
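A standalone worked example of the two substitutions parseDatasetName applies; the dataset_id format string and the _patpat definition below are illustrative stand-ins for the configured option and the module-level pattern.

    import re

    _patpat = re.compile(r'%\(([^()]*)\)s')     # assumed equivalent of the module-level pattern
    idFormat = "%(project)s.%(model)s.%(experiment)s.%(ensemble)s"   # made-up dataset_id option

    pattern = re.sub(r'\.', r'\.', idFormat)                 # '.' => '\.'
    pattern = re.sub(_patpat, r'(?P<\1>[^.]*)', pattern)     # %(name)s => (?P<name>[^.]*)
    if pattern[-1] != '$':
        pattern += '$'

    m = re.match(pattern, "cmip5.MYMODEL.historical.r1i1p1")
    print m.groupdict()
    # e.g. {'project': 'cmip5', 'model': 'MYMODEL', 'experiment': 'historical', 'ensemble': 'r1i1p1'}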
def deleteDatasetList(datasetNames, Session, gatewayOperation=UNPUBLISH, thredds=True, las=False, deleteInDatabase=False, progressCallback=None, deleteAll=False, republish=False, reinitThredds=True, restInterface=False, pid_connector=None, project_config_section=None, data_node=None): """ Delete or retract a list of datasets: - Delete the dataset from the gateway. - Remove the catalogs from the THREDDS catalog (optional). - Reinitialize the LAS server and THREDDS server. - Delete the database entry (optional). if republish is False: Returns a status dictionary: datasetName => status else Returns a tuple (status_dictionary, republishList), where republishList is a list of (dataset_name, version) tuples to be republished. datasetNames A list of )dataset_name, version) tuples. Session A database Session. gatewayOperation An enumeration. If: - publish.DELETE: Remove all metadata from the gateway database. - publish.UNPUBLISH: (Default) Remove metadata that allows dataset discovery from the gateway. - publish.NO_OPERATION: No gateway delete/retract operation is called. thredds Boolean flag: if true (the default), delete the associated THREDDS catalog and reinitialize server. las Boolean flag: if true (the default), reinitialize server. deleteInDatabase Boolean flag: if true (default is False), delete the database entry. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. deleteAll Boolean, if True delete all versions of the dataset(s). republish Boolean, if True return (statusDictionary, republishList), where republishList is a list of datasets to be republished. reinitThredds Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server. restInterface Boolean flag. If True, publish datasets with the RESTful publication services. pid_connector esgfpid.Connector object to register PIDs project_config_section Name of the project config section in esg.ini (for user specific project configs) data_node String, the datanode to unpublish (only for unpublication from Solr) """ if gatewayOperation == UNINITIALIZED: raise ESGPublishError("Need to set mandatory --delete|--retract|--skip-index argument!") if gatewayOperation not in (DELETE, UNPUBLISH, NO_OPERATION): raise ESGPublishError("Invalid gateway operation: %d"%gatewayOperation) deleteOnGateway = (gatewayOperation==DELETE) operation = (gatewayOperation!=NO_OPERATION) session = Session() resultDict = {} config = getConfig() # Check the dataset names and cache the results for the gateway, thredds, and database phases nameDict = {} for datasetName,version in datasetNames: isDataset, dset, versionObjs, isLatest = datasetOrVersionName(datasetName, version, session, deleteAll=deleteAll, restInterface=restInterface) if dset is None: warning("Dataset not found in node database: %s"%datasetName) nameDict[datasetName] = (isDataset, dset, versionObjs, isLatest) # Delete the dataset from the gateway. 
if operation: # Create the web service proxy threddsRootURL = config.get('DEFAULT', 'thredds_url') serviceCertfile = config.get('DEFAULT', 'hessian_service_certfile') serviceKeyfile = config.get('DEFAULT', 'hessian_service_keyfile') if not restInterface: serviceURL = getHessianServiceURL(project_config_section=project_config_section) servicePort = config.getint('DEFAULT','hessian_service_port') serviceDebug = config.getboolean('DEFAULT', 'hessian_service_debug') service = Hessian(serviceURL, servicePort, key_file=serviceKeyfile, cert_file=serviceCertfile, debug=serviceDebug) else: serviceURL = getRestServiceURL(project_config_section=project_config_section) serviceDebug = config.getboolean('DEFAULT', 'rest_service_debug', default=False) service = RestPublicationService(serviceURL, serviceCertfile, keyFile=serviceKeyfile, debug=serviceDebug) for datasetName,version in datasetNames: if version > -1: datasetToUnpublish = '%s.v%s' % (datasetName, version) else: datasetToUnpublish = datasetName isDataset, dset, versionObjs, isLatest = nameDict[datasetName] try: eventName, stateName = deleteGatewayDatasetVersion(datasetToUnpublish, gatewayOperation, service, session, dset=dset, data_node=data_node) except RemoteCallException, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[0:2], '\n'))) continue except ESGPublishError, e: fields = `e`.split('\n') error("Deletion/retraction failed for dataset/version %s with message: %s"%(datasetToUnpublish, string.join(fields[-2:], '\n'))) continue info(" Result: %s"%stateName) resultDict[datasetName] = eventName
def start_harvest( self, parent ): from esgcet.publish import publishDatasetList from esgcet.model import Dataset, PUBLISH_FAILED_EVENT, ERROR_LEVEL dcolor1 = Pmw.Color.changebrightness(self.parent.parent, 'aliceblue', 0.8 ) # Make sure the publisher is logged in # if not self.parent.parent.password_flg: # self.parent.parent.menu.login_menu.evt_login( self.parent.parent ) # Start the busy routine to indicate to the users something is happening self.parent.parent.busyCursor = 'watch' self.parent.parent.busyWidgets = [self.parent.parent.pane2.pane( 'EditPaneTop' ), self.parent.parent.pane2.pane( 'EditPaneBottom' ), self.parent.parent.pane2.pane( 'EditPaneStatus' ), self.parent.parent.pane.pane( 'ControlPane' )] pub_busy.busyStart( self.parent.parent ) try: # Generate the list of datasets to be published datasetNames=[] GUI_line = {} tab_name = self.parent.parent.top_notebook.getcurselection() selected_page = self.parent.parent.main_frame.selected_top_page if (selected_page is None): warning("Must generate a list of datasets to scan before publishing can occur.") pub_busy.busyEnd( self.parent.parent ) return for x in self.parent.parent.main_frame.top_page_id[selected_page]: if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon' and self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('bg') != 'salmon': dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text') ####################################### # ganz added this 1/18/11 versionNum = self.parent.parent.main_frame.version_label[selected_page][x].cget('text') dsetTuple = (dset_name, versionNum) #dsetName = generateDatasetVersionId(dsetTuple) ##################################################################################### # dsetTuple = parseDatasetVersionId(dset_name) # ganz no longer necessary datasetNames.append(dsetTuple) GUI_line[ dset_name ] = x else: if self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('bg') == 'salmon': self.parent.parent.main_frame.top_page_id[selected_page][x].configure(relief = 'raised', background = 'salmon', image = self.off) # Publish collection of datasets testProgress = (self.parent.parent.statusbar.show, 0, 100) publishThredds = (quality_control_widgets.get_CheckBox3()==1) publishGateway = (quality_control_widgets.get_CheckBox2()==1) if (publishThredds): print 'publishing to Thredds' if (publishGateway): print 'publishing to Gateway' status_dict = publishDatasetList(datasetNames, self.Session, publish=publishGateway, thredds=publishThredds, progressCallback=testProgress) # Show the published status for x in status_dict.keys(): status = status_dict[ x ] dsetName, versionNo = x dsetVersionName = generateDatasetVersionId(x) guiLine = GUI_line[dsetName] # dsetVersionName] self.parent.parent.main_frame.status_label[selected_page][guiLine].configure(text=pub_controls.return_status_text( status) ) dset = Dataset.lookup(dsetName, self.Session) if dset.has_warnings(self.Session): warningLevel = dset.get_max_warning_level(self.Session) if warningLevel>=ERROR_LEVEL: buttonColor = "pink" buttonText = "Error" else: buttonColor = "yellow" buttonText = "Warning" self.parent.parent.main_frame.ok_err[selected_page][guiLine].configure( text = buttonText, bg = buttonColor, relief = 'raised', command = pub_controls.Command( self.parent.parent.pub_buttonexpansion.extraction_widgets.error_extraction_button, dset ) ) else: self.parent.parent.main_frame.ok_err[selected_page][guiLine].configure( text = 'Ok', bg = dcolor1, highlightcolor = 
dcolor1, relief = 'sunken', ) except: pub_busy.busyEnd( self.parent.parent ) # catch here in order to turn off the busy cursor ganz raise finally: pub_busy.busyEnd( self.parent.parent ) self.my_refresh()