def preInitialization(self):
    pollInterval = self.config.PhEDExInjector.pollInterval
    subInterval = self.config.PhEDExInjector.subscribeInterval
    logging.info("Setting poll interval to %s seconds for inject", pollInterval)

    # retrieving the node mappings is fickle and can fail quite often
    # hence only do it once (with retries) and pass it to the workers
    phedex = PhEDEx({"endpoint": self.config.PhEDExInjector.phedexurl}, "json")
    try:
        nodeMappings = phedex.getNodeMap()
    except Exception:
        time.sleep(2)
        try:
            nodeMappings = phedex.getNodeMap()
        except Exception:
            time.sleep(4)
            nodeMappings = phedex.getNodeMap()

    myThread = threading.currentThread()
    myThread.workerThreadManager.addWorker(
        PhEDExInjectorPoller(self.config, phedex, nodeMappings),
        pollInterval)

    if getattr(self.config.PhEDExInjector, "subscribeDatasets", False):
        # wait a bit for first poll cycle of PhEDExInjectorPoller to complete
        # hopefully avoids intermingled logs (which can be confusing)
        time.sleep(2)
        logging.info("Setting poll interval to %s seconds for subscribe", subInterval)
        myThread.workerThreadManager.addWorker(
            PhEDExInjectorSubscriber(self.config, phedex, nodeMappings),
            subInterval)
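The nested try/except blocks above implement a fixed-backoff retry. A minimal generic sketch of the same idea (the helper name and backoff schedule are illustrative, not part of WMCore):

import time

def callWithRetries(func, backoffs=(2, 4)):
    """Call func(), sleeping after each failure; the final attempt re-raises."""
    for delay in backoffs:
        try:
            return func()
        except Exception:
            time.sleep(delay)
    return func()

# equivalent to the nested try/except above:
# nodeMappings = callWithRetries(phedex.getNodeMap)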
def testSlcPhedexNodesEqualPhedexApiNodes(self):
    """
    For each site, verify that the stageout node specified in
    site-local-config.xml is the same as the one returned by the PhEDEx API.
    """
    os.environ["CMS_PATH"] = "/cvmfs/cms.cern.ch"
    phedex = PhEDEx()
    nodes = phedex.getNodeMap()["phedex"]["node"]

    # Make a dict for translating the se names into regular site names.
    node_map = {}
    for node in nodes:
        node_map[str(node[u"se"])] = str(node[u"name"])

    for d in os.listdir("/cvmfs/cms.cern.ch/SITECONF/"):
        # Only T0_, T1_... folders are needed
        if d[0] == "T":
            os.environ['WMAGENT_SITE_CONFIG_OVERRIDE'] = '/cvmfs/cms.cern.ch/SITECONF/%s/JobConfig/site-local-config.xml' % d
            try:
                slc = loadSiteLocalConfig()
            except SiteConfigError as e:
                print(e.args[0])
                continue  # skip sites whose SLC cannot be parsed
            phedexNode = slc.localStageOut.get("phedex-node")
            # If slc is correct, perform check
            if "se-name" in slc.localStageOut and slc.localStageOut["se-name"] in node_map and phedexNode is not None:
                self.assertEqual(phedexNode, node_map[slc.localStageOut["se-name"]],
                                 "Error: Node specified in SLC (%s) doesn't match node returned by PhEDEx API (%s)."
                                 % (phedexNode, node_map[slc.localStageOut["se-name"]]))
    return
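The se-to-site translation above can be exercised standalone; a tiny illustration with made-up node records (the field values are not real PhEDEx data):

nodes = [{u"se": u"cmssrm.fnal.gov", u"name": u"T1_US_FNAL_MSS"},
         {u"se": u"srm-cms.cern.ch", u"name": u"T2_CH_CERN"}]
node_map = {}
for node in nodes:
    node_map[str(node[u"se"])] = str(node[u"name"])
assert node_map["cmssrm.fnal.gov"] == "T1_US_FNAL_MSS"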
def keepOnlyDisks(self, locationsMap):
    phedex = PhEDEx()  # TODO use certs from the config!
    # get all the PNNs that are of kind 'Disk'
    try:
        diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk'])
    except HTTPException as ex:
        self.logger.error(ex.headers)
        raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n" +
                                  "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)" +
                                  " and contact the experts if the error persists.\nError reason: %s" % str(ex))
        # TODO add the phedex nodes so the user can check themselves
    diskLocationsMap = {}
    for block, locations in locationsMap.iteritems():
        locations[:] = [x for x in locations if x != 'T3_CH_CERN_OpenData']  # ignore OpenData until it is accessible by CRAB
        if set(locations) & diskLocations:
            # at least some locations are disk
            diskLocationsMap[block] = locationsMap[block]
        else:
            # no locations are in the disk list, assume that they are tape
            self.tapeLocations = self.tapeLocations.union(set(locations) - diskLocations)
    locationsMap.clear()  # remove all blocks
    locationsMap.update(diskLocationsMap)  # add only blocks with disk locations
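For illustration, a minimal sketch of what keepOnlyDisks does to a locations map (block names, PNNs, and the disk set are made up):

# hypothetical inputs
diskLocations = set(["T1_US_FNAL_Disk"])
locationsMap = {"/A/B/RAW#1": ["T1_US_FNAL_Disk", "T1_US_FNAL_MSS"],
                "/A/B/RAW#2": ["T1_UK_RAL_MSS"]}  # tape only

# blocks with at least one disk location survive, the rest are dropped
diskOnly = dict((block, locs) for block, locs in locationsMap.items()
                if set(locs) & diskLocations)
assert list(diskOnly) == ["/A/B/RAW#1"]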
def testSlcPhedexNodesEqualPhedexApiNodes(self):
    """
    For each site, verify that the stageout node specified in
    site-local-config.xml is the same as the one returned by the PhEDEx API.
    """
    os.environ["CMS_PATH"] = "/cvmfs/cms.cern.ch"
    phedex = PhEDEx()
    nodes = [node[u'name'] for node in phedex.getNodeMap()["phedex"]["node"]]
    for d in os.listdir("/cvmfs/cms.cern.ch/SITECONF/"):
        # Only T0_, T1_... folders are needed
        if d[0] == "T":
            os.environ['WMAGENT_SITE_CONFIG_OVERRIDE'] = '/cvmfs/cms.cern.ch/SITECONF/%s/JobConfig/site-local-config.xml' % d
            try:
                slc = loadSiteLocalConfig()
            except SiteConfigError as e:
                print(e.args[0])
                continue  # skip sites whose SLC cannot be parsed
            phedexNode = slc.localStageOut.get("phedex-node")
            self.assertTrue(phedexNode in nodes,
                            "Error: Node specified in SLC (%s) not in list returned by PhEDEx API" % phedexNode)
    return
def keepOnlyDisks(self, locationsMap):
    phedex = PhEDEx()  # TODO use certs from the config!
    # get all the PNNs that are of kind 'Disk'
    try:
        diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk'])
    except Exception as ex:  # TODO should we catch HTTPException instead?
        self.logger.exception(ex)
        raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n" +
                                  "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)" +
                                  " and contact the experts if the error persists.\nError reason: %s" % str(ex))
    # TODO add the phedex nodes so the user can check themselves
def keepOnlyDisks(self, locationsMap):
    phedex = PhEDEx()  # TODO use certs from the config!
    # get all the PNNs that are of kind 'Disk'
    try:
        diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk'])
    except HTTPException as ex:
        self.logger.error(ex.headers)
        raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n" +
                                  "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)" +
                                  " and contact the experts if the error persists.\nError reason: %s" % str(ex))
    # TODO add the phedex nodes so the user can check themselves
    for block, locations in locationsMap.iteritems():
        locationsMap[block] = set(locations) & diskLocations
        self.otherLocations = self.otherLocations.union(set(locations) - diskLocations)
def phedexIt():
    x = PhEDEx(responseType = "json")
    phedexNodes = x.getNodeMap()['phedex']['node']
    phedexMap = {}
    sePhedexMap = {}
    knownPhedexNodes = set()
    for node in phedexNodes:
        phedexMap[node['name']] = node['kind']
        # print('%s -> %s, %s' % (node['name'], node['kind'], node['se']))
        if node['se'] not in sePhedexMap:
            sePhedexMap[node['se']] = set()
        sePhedexMap[node['se']].add(node['name'])
        knownPhedexNodes.add(node['name'])

    y = SiteDBJSON()
    seNames = y.getAllSENames()
    cmsNamesMap = {}
    for se in seNames:
        cmsNames = y.seToCMSName(se)
        cmsNamesMap[se] = cmsNames

    seToNodeMap = {}
    for se in cmsNamesMap:
        candidates = set()
        for cmsName in cmsNamesMap[se]:
            phedexNodes = y.cmsNametoPhEDExNode(cmsName)
            candidates.update(set(phedexNodes))
        validCandidates = set()
        for candidate in candidates:
            if candidate in knownPhedexNodes:
                validCandidates.add(candidate)
        seToNodeMap[se] = validCandidates
        # print('%s to %s' % (se, candidates))

    for se in sePhedexMap:
        if se not in seToNodeMap:
            print("SE: %s is not in new mapping for sites %s" % (se, list(sePhedexMap[se])))
    for se in seToNodeMap:
        if se not in sePhedexMap:
            print("SE: %s is not in old mapping for sites %s" % (se, list(seToNodeMap[se])))
            continue
    for se in set(seToNodeMap.keys()).intersection(set(sePhedexMap.keys())):
        diff = sePhedexMap[se] - seToNodeMap[se]
        if diff:
            print("%s are in old mapping but not in new for %s" % (str(list(diff)), se))
        diff = seToNodeMap[se] - sePhedexMap[se]
        if diff:
            print("%s are in new mapping but not in old for %s" % (str(list(diff)), se))
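The three comparison loops at the end amount to a symmetric difference of two {key: set} mappings; a compact generic sketch (the helper name is illustrative):

def diffMappings(old, new):
    """Yield human-readable differences between two {key: set(values)} mappings."""
    for key in set(old) - set(new):
        yield "%s only in old mapping: %s" % (key, sorted(old[key]))
    for key in set(new) - set(old):
        yield "%s only in new mapping: %s" % (key, sorted(new[key]))
    for key in set(old) & set(new):
        if old[key] != new[key]:
            yield "%s differs: old-only %s, new-only %s" % (
                key, sorted(old[key] - new[key]), sorted(new[key] - old[key]))

# for line in diffMappings(sePhedexMap, seToNodeMap):
#     print(line)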
def keepOnlyDisks(self, locationsMap):
    self.otherLocations = set()
    phedex = PhEDEx()  # TODO use certs from the config!
    # get all the PNNs that are of kind 'Disk'
    try:
        diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk'])
    except HTTPException as ex:
        self.logger.error(ex.headers)
        raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n" +
                                  "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)" +
                                  " and contact the experts if the error persists.\nError reason: %s" % str(ex))
    # TODO add the phedex nodes so the user can check themselves
    for block, locations in locationsMap.iteritems():
        locationsMap[block] = set(locations) & diskLocations
        self.otherLocations = self.otherLocations.union(set(locations) - diskLocations)
    # remove any key whose value is an empty set
    for key, value in locationsMap.items():  # won't work in python3!
        if value == set([]):
            locationsMap.pop(key)
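As the inline comment warns, `dict.items()` returns a view in python3, so popping keys while iterating it raises a RuntimeError there; a version-independent sketch of the same cleanup:

# materialize the keys first so the dict can be mutated safely in python2 and python3
for key in list(locationsMap):
    if not locationsMap[key]:
        locationsMap.pop(key)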
class PhEDExInjectorSubscriber(BaseWorkerThread):
    """
    _PhEDExInjectorSubscriber_

    Poll the DBSBuffer database and subscribe datasets as they are created.
    """
    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json")
        self.siteDB = SiteDBJSON()
        self.dbsUrl = config.DBSInterface.globalDBSUrl
        self.group = getattr(config.PhEDExInjector, "group", "DataOps")
        self.safeMode = getattr(config.PhEDExInjector, "safeOperationMode", False)
        self.replicaOnly = getattr(config.PhEDExInjector, "replicaOnly", False)

        # Subscribed state in the DBSBuffer table for datasets
        self.terminalSubscriptionState = 1
        if self.safeMode:
            self.terminalSubscriptionState = 2

        # We will map node names to CMS names, that is what the spec will have.
        # If a CMS name is associated to many PhEDEx nodes then choose the MSS option
        self.cmsToPhedexMap = {}

        # initialize the alert framework (if available - config.Alert present)
        # self.sendAlert will then be available
        self.initAlerts(compName = "PhEDExInjector")

    def setup(self, parameters):
        """
        _setup_

        Create a DAO Factory for the PhEDExInjector.  Also load the SE names
        to PhEDEx node name mappings from the data service.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database",
                                logger = self.logger,
                                dbinterface = myThread.dbi)

        self.getUnsubscribed = daofactory(classname = "GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname = "MarkDatasetSubscribed")
        self.getPartiallySubscribed = daofactory(classname = "GetPartiallySubscribedDatasets")

        nodeMappings = self.phedex.getNodeMap()
        for node in nodeMappings["phedex"]["node"]:
            cmsName = self.siteDB.phEDExNodetocmsName(node["name"])
            if cmsName not in self.cmsToPhedexMap:
                self.cmsToPhedexMap[cmsName] = {}
            logging.info("Loaded PhEDEx node %s for site %s" % (node["name"], cmsName))
            if node["kind"] not in self.cmsToPhedexMap[cmsName]:
                self.cmsToPhedexMap[cmsName][node["kind"]] = node["name"]
        return

    def algorithm(self, parameters):
        """
        _algorithm_

        Poll the database for datasets and subscribe them.
        """
        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Check for completely unsubscribed datasets
        unsubscribedDatasets = self.getUnsubscribed.execute(conn = myThread.transaction.conn,
                                                            transaction = True)

        if self.safeMode:
            partiallySubscribedDatasets = self.getPartiallySubscribed.execute(conn = myThread.transaction.conn,
                                                                              transaction = True)
            unsubscribedDatasets.extend(partiallySubscribedDatasets)
            partiallySubscribedSet = set()
            for entry in partiallySubscribedDatasets:
                partiallySubscribedSet.add(entry["path"])

        # Map the datasets to their specs
        specDatasetMap = {}
        for unsubscribedDataset in unsubscribedDatasets:
            datasetPath = unsubscribedDataset["path"]
            workflow = unsubscribedDataset["workflow"]
            spec = unsubscribedDataset["spec"]
            if datasetPath not in specDatasetMap:
                specDatasetMap[datasetPath] = []
            specDatasetMap[datasetPath].append({"workflow": workflow, "spec": spec})

        specCache = {}
        siteMap = {}
        # Distribute the subscriptions by site, type and priority
        # This is to make as few subscriptions as possible
        # Site map values are dictionaries where the keys are tuples (Prio, Custodial, AutoApprove, Move)
        # Where Custodial is boolean, Prio is in ["Low", "Normal", "High"], AutoApprove is boolean and Move is boolean
        for dataset in specDatasetMap:
            # Aggregate all the different subscription configurations
            subInfo = {}
            for entry in specDatasetMap[dataset]:
                if not entry["spec"]:
                    # Can't use this spec, there isn't one
                    continue
                # Load spec if not in the cache
                if entry["spec"] not in specCache:
                    helper = WMWorkloadHelper()
                    try:
                        helper.load(entry["spec"])
                        specCache[entry["spec"]] = helper
                    except Exception:
                        # Couldn't load it, alert and carry on
                        msg = "Couldn't load spec: %s" % entry["spec"]
                        logging.error(msg)
                        self.sendAlert(7, msg = msg)
                        continue
                # If we are running in safe mode, we need to know if the workflow is ready
                # We have the spec, get the info
                helper = specCache[entry["spec"]]
                workflowSubInfo = helper.getSubscriptionInformation()
                datasetSubInfo = workflowSubInfo.get(dataset, None)
                if datasetSubInfo and subInfo:
                    subInfo["CustodialSites"] = extendWithoutDups(subInfo["CustodialSites"],
                                                                  datasetSubInfo["CustodialSites"])
                    subInfo["NonCustodialSites"] = extendWithoutDups(subInfo["NonCustodialSites"],
                                                                     datasetSubInfo["NonCustodialSites"])
                    subInfo["AutoApproveSites"] = extendWithoutDups(subInfo["AutoApproveSites"],
                                                                    datasetSubInfo["AutoApproveSites"])
                    subInfo["Priority"] = solvePrioConflicts(subInfo["Priority"],
                                                             datasetSubInfo["Priority"])
                elif datasetSubInfo:
                    subInfo = datasetSubInfo

            # We now have aggregated subscription information for this dataset in subInfo
            # Distribute it by site
            if not subInfo:
                # Nothing to do, log and continue
                msg = "No subscriptions configured for dataset %s" % dataset
                logging.warning(msg)
                self.markSubscribed.execute(dataset, subscribed = self.terminalSubscriptionState,
                                            conn = myThread.transaction.conn,
                                            transaction = True)
                continue

            # Make sure that a site is not configured both as non custodial and custodial
            # Non-custodial is believed to be the right choice
            subInfo["CustodialSites"] = list(set(subInfo["CustodialSites"]) - set(subInfo["NonCustodialSites"]))
            for site in subInfo["CustodialSites"]:
                if site not in siteMap:
                    siteMap[site] = {}
                autoApprove = False
                if site in subInfo["AutoApproveSites"]:
                    autoApprove = True
                if self.safeMode and dataset not in partiallySubscribedSet:
                    tupleKey = (subInfo["Priority"], True, autoApprove, False)
                else:
                    tupleKey = (subInfo["Priority"], True, autoApprove, True)
                if tupleKey not in siteMap[site]:
                    siteMap[site][tupleKey] = []
                # Subscriptions are sorted by options, defined by tupleKey
                # The tuple key has 3 or 4 entries in this order
                # Priority, Custodial, Auto approve, Move (True) or Replica (False)
                siteMap[site][tupleKey].append(dataset)

            # If we are in safe mode and this is a partially subscribed dataset,
            # then the non-custodial were done in a previous cycle
            if self.safeMode and dataset in partiallySubscribedSet:
                self.markSubscribed.execute(dataset, subscribed = self.terminalSubscriptionState,
                                            conn = myThread.transaction.conn,
                                            transaction = True)
                continue

            for site in subInfo["NonCustodialSites"]:
                if site not in siteMap:
                    siteMap[site] = {}
                autoApprove = False
                if site in subInfo["AutoApproveSites"]:
                    autoApprove = True
                # Non-custodial is never move, so this tuple has only 3 entries
                # TODO: Change tuples to frozensets for clarity
                tupleKey = (subInfo["Priority"], False, autoApprove)
                if tupleKey not in siteMap[site]:
                    siteMap[site][tupleKey] = []
                siteMap[site][tupleKey].append(dataset)

            self.markSubscribed.execute(dataset, subscribed = 1,
                                        conn = myThread.transaction.conn,
                                        transaction = True)

        # Actually request the subscriptions
        for site in siteMap:
            # Check that the site is valid
            if site not in self.cmsToPhedexMap:
                msg = "Site %s doesn't appear to be valid to PhEDEx" % site
                logging.error(msg)
                self.sendAlert(7, msg = msg)
                continue
            for subscriptionFlavor in siteMap[site]:
                datasets = siteMap[site][subscriptionFlavor]

                # Check that the site is valid
                isMSS = False
                if "MSS" in self.cmsToPhedexMap[site]:
                    isMSS = True
                    phedexNode = self.cmsToPhedexMap[site]["MSS"]
                else:
                    phedexNode = self.cmsToPhedexMap[site]["Disk"]

                logging.info("Subscribing %s to %s" % (datasets, site))
                options = {"custodial": "n", "requestOnly": "y",
                           "priority": subscriptionFlavor[0].lower(),
                           "move": "n"}
                if subscriptionFlavor[1] and isMSS:
                    # Custodial subscriptions are only allowed in MSS nodes
                    # If custodial is requested on a non-MSS node it falls back to a non-custodial subscription
                    options["custodial"] = "y"
                    if subscriptionFlavor[3] and not self.replicaOnly:
                        options["move"] = "y"
                if subscriptionFlavor[2]:
                    options["requestOnly"] = "n"
                logging.info("Request options: Custodial - %s, Move - %s, Request Only - %s" % (options["custodial"].upper(),
                                                                                                options["move"].upper(),
                                                                                                options["requestOnly"].upper()))
                newSubscription = PhEDExSubscription(datasets, phedexNode, self.group,
                                                     **options)
                xmlData = XMLDrop.makePhEDExXMLForDatasets(self.dbsUrl,
                                                           newSubscription.getDatasetPaths())
                logging.debug(str(xmlData))
                self.phedex.subscribe(newSubscription, xmlData)

        myThread.transaction.commit()
        return
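The siteMap built by algorithm() groups datasets so that all datasets sharing a site and an option tuple go out in a single PhEDEx request. A minimal illustration of the structure (the site and dataset names are made up):

# siteMap: site -> {(priority, custodial, autoApprove[, move]): [datasets]}
siteMap = {}
site, tupleKey = "T1_US_FNAL", ("Normal", True, False, True)
siteMap.setdefault(site, {}).setdefault(tupleKey, []).append("/Primary/Proc-v1/RECO")
# every dataset appended under the same tupleKey ends up in one subscription request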
class AccountantWorker(WMConnectionBase):
    """
    Class that actually does the work of parsing FWJRs for the Accountant
    Run through ProcessPool
    """
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer",
                                        logger = myThread.logger,
                                        dbinterface = myThread.dbi)

        self.getOutputMapAction = self.daofactory(classname = "Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(classname = "Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(classname = "Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname = "Jobs.GetType")
        self.getParentInfoAction = self.daofactory(classname = "Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(classname = "Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(classname = "Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname = "Files.AddRunLumi")
        self.setFileLocation = self.daofactory(classname = "Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(classname = "Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname = "Files.Add")
        self.jobCompleteInput = self.daofactory(classname = "Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname = "Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(classname = "Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname = "Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(classname = "Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(classname = "Jobs.GetFWJRTaskName")

        self.dbsStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(classname = "DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(classname = "DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(classname = "DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(classname = "DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(classname = "DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname = "ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(classname = "DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # ACDC service
        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName = "WMStatsAgent")

        # Hold data for later commit
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID = collections.deque(maxlen = 1000)
        self.datasetAlgoPaths = collections.deque(maxlen = 1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen = 1000)
        self.workflowPaths = collections.deque(maxlen = 1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return

    def reset(self):
        """
        _reset_

        Reset all global vars between runs.
        """
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        gc.collect()
        return

    def loadJobReport(self, parameters):
        """
        _loadJobReport_

        Given a framework job report on disk, load it and return a
        FwkJobReport instance.  If there is any problem loading or parsing the
        framework job report return None.
        """
        # The jobReportPath may be prefixed with "file://" which needs to be
        # removed so it doesn't confuse the FwkJobReport() parser.
        jobReportPath = parameters.get("fwjr_path", None)
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty")

        jobReportPath = jobReportPath.replace("file://", "")
        if not os.path.exists(jobReportPath):
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath)

        if os.path.getsize(jobReportPath) == 0:
            logging.error("Empty FwkJobReport: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath)

        jobReport = Report()

        try:
            jobReport.load(jobReportPath)
        except Exception as ex:
            msg = "Error loading jobReport %s\n" % jobReportPath
            msg += str(ex)
            logging.error(msg)
            logging.debug("Failing job: %s\n" % parameters)
            return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport')

        if len(jobReport.listSteps()) == 0:
            logging.error("FwkJobReport with no steps: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath)

        return jobReport

    def isTaskExistInFWJR(self, jobReport, jobStatus):
        """
        If taskName is not available in the FWJR, then tries to recover it
        getting data from the SQL database.
        """
        if not jobReport.getTaskName():
            logging.warning("Trying to recover a corrupted FWJR for a %s job with job id %s" % (jobStatus,
                                                                                                jobReport.getJobID()))
            jobInfo = self.getJobTaskNameAction.execute(jobId = jobReport.getJobID(),
                                                        conn = self.getDBConn(),
                                                        transaction = self.existingTransaction())

            jobReport.setTaskName(jobInfo['taskName'])
            jobReport.save(jobInfo['fwjr_path'])
            if not jobReport.getTaskName():
                msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % (jobStatus,
                                                                                                   jobReport.getJobID())
                raise AccountantWorkerException(msg)
            else:
                logging.info("TaskName '%s' successfully recovered and added to fwjr id %s." % (jobReport.getTaskName(),
                                                                                                jobReport.getJobID()))

        return

    def __call__(self, parameters):
        """
        __call__

        Handle a completed job.  The parameters dictionary will contain the
        job ID and the path to the framework job report.
        """
        returnList = []
        self.reset()

        for job in parameters:
            logging.info("Handling %s" % job["fwjr_path"])

            # Load the job and set the ID
            fwkJobReport = self.loadJobReport(job)
            fwkJobReport.setJobID(job['id'])

            jobSuccess = self.handleJob(jobID = job["id"],
                                        fwkJobReport = fwkJobReport)

            if self.returnJobReport:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess,
                                   'jobReport': fwkJobReport})
            else:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess})

            self.count += 1

        self.beginTransaction()

        # Now things done at the end of the job
        # Do what we can with WMBS files
        self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds)

        # handle merge files separately since parentage need to set
        # separately to support robust merge
        self.handleWMBSFiles(self.wmbsMergeFilesToBuild, self.parentageBindsForMerge)

        # Create DBSBufferFiles
        self.createFilesInDBSBuffer()

        # Handle filesetAssoc
        if len(self.filesetAssoc) > 0:
            self.bulkAddToFilesetAction.execute(binds = self.filesetAssoc,
                                                conn = self.getDBConn(),
                                                transaction = self.existingTransaction())

        # Move successful jobs to successful
        if len(self.listOfJobsToSave) > 0:
            idList = [x['id'] for x in self.listOfJobsToSave]
            outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']} for x in self.listOfJobsToSave]
            self.setBulkOutcome.execute(binds = outcomeBinds,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

            self.jobCompleteInput.execute(id = idList,
                                          lfnsToSkip = self.jobsWithSkippedFiles,
                                          conn = self.getDBConn(),
                                          transaction = self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToSave, "success", "complete")

        # If we have failed jobs, fail them
        if len(self.listOfJobsToFail) > 0:
            outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']} for x in self.listOfJobsToFail]
            self.setBulkOutcome.execute(binds = outcomeBinds,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed", "complete")

        # Arrange WMBS parentage
        if len(self.parentageBinds) > 0:
            self.setParentageByJob.execute(binds = self.parentageBinds,
                                           conn = self.getDBConn(),
                                           transaction = self.existingTransaction())
        if len(self.parentageBindsForMerge) > 0:
            self.setParentageByMergeJob.execute(binds = self.parentageBindsForMerge,
                                                conn = self.getDBConn(),
                                                transaction = self.existingTransaction())

        # Straighten out DBS Parentage
        if len(self.mergedOutputFiles) > 0:
            self.handleDBSBufferParentage()

        if len(self.jobsWithSkippedFiles) > 0:
            self.handleSkippedFiles()

        self.commitTransaction(existingTransaction = False)

        return returnList

    def outputFilesetsForJob(self, outputMap, merged, moduleLabel):
        """
        _outputFilesetsForJob_

        Determine if the file should be placed in any other fileset.  Note
        that this will not return the JobGroup output fileset as all jobs will
        have their output placed there.
        """
        if moduleLabel not in outputMap:
            logging.info("Output module label missing from output map.")
            return []

        outputFilesets = []
        for outputFileset in outputMap[moduleLabel]:
            if merged == False and outputFileset["output_fileset"] != None:
                outputFilesets.append(outputFileset["output_fileset"])
            else:
                if outputFileset["merged_output_fileset"] != None:
                    outputFilesets.append(outputFileset["merged_output_fileset"])

        return outputFilesets

    def addFileToDBS(self, jobReportFile, task):
        """
        _addFileToDBS_

        Add a file that was output from a job to the DBS buffer.
        """
        datasetInfo = jobReportFile["dataset"]

        dbsFile = DBSBufferFile(lfn = jobReportFile["lfn"],
                                size = jobReportFile["size"],
                                events = jobReportFile["events"],
                                checksums = jobReportFile["checksums"],
                                status = "NOTUPLOADED")
        dbsFile.setAlgorithm(appName = datasetInfo["applicationName"],
                             appVer = datasetInfo["applicationVersion"],
                             appFam = jobReportFile["module_label"],
                             psetHash = "GIBBERISH",
                             configContent = jobReportFile.get('configURL'))

        dbsFile.setDatasetPath("/%s/%s/%s" % (datasetInfo["primaryDataset"],
                                              datasetInfo["processedDataset"],
                                              datasetInfo["dataTier"]))
        dbsFile.setValidStatus(validStatus = jobReportFile.get("validStatus", None))
        dbsFile.setProcessingVer(ver = jobReportFile.get('processingVer', None))
        dbsFile.setAcquisitionEra(era = jobReportFile.get('acquisitionEra', None))
        dbsFile.setGlobalTag(globalTag = jobReportFile.get('globalTag', None))
        # TODO need to find where to get the prep id
        dbsFile.setPrepID(prep_id = jobReportFile.get('prep_id', None))
        dbsFile['task'] = task

        for run in jobReportFile["runs"]:
            newRun = Run(runNumber = run.run)
            newRun.extend(run.lumis)
            dbsFile.addRun(newRun)

        dbsFile.setLocation(pnn = list(jobReportFile["locations"])[0], immediateSave = False)
        self.dbsFilesToCreate.append(dbsFile)
        return

    def findDBSParents(self, lfn):
        """
        _findDBSParents_

        Find the parent of the file in DBS.  This is meant to be called
        recursively.
        """
        parentsInfo = self.getParentInfoAction.execute([lfn],
                                                       conn = self.getDBConn(),
                                                       transaction = self.existingTransaction())
        newParents = set()
        for parentInfo in parentsInfo:
            # This will catch straight to merge files that do not have redneck
            # parents.  We will mark the straight to merge file from the job
            # as a child of the merged parent.
            if int(parentInfo["merged"]) == 1:
                newParents.add(parentInfo["lfn"])

            elif parentInfo['gpmerged'] == None:
                continue

            # Handle the files that result from merge jobs that aren't redneck
            # children.  We have to setup parentage and then check on whether
            # or not this file has any redneck children and update their
            # parentage information.
            elif int(parentInfo["gpmerged"]) == 1:
                newParents.add(parentInfo["gplfn"])

            # If that didn't work, we've reached the great-grandparents
            # and we have to work via recursion
            else:
                parentSet = self.findDBSParents(lfn = parentInfo['gplfn'])
                for parent in parentSet:
                    newParents.add(parent)

        return newParents

    def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID = None):
        """
        _addFileToWMBS_

        Add a file that was produced in a job to WMBS.
        """
        fwjrFile["first_event"] = jobMask["FirstEvent"]

        if fwjrFile["first_event"] == None:
            fwjrFile["first_event"] = 0

        if jobType == "Merge" and fwjrFile["module_label"] != "logArchive":
            setattr(fwjrFile["fileRef"], 'merged', True)
            fwjrFile["merged"] = True

        wmbsFile = self.createFileFromDataStructsFile(file = fwjrFile, jobID = jobID)

        if jobType == "Merge":
            self.wmbsMergeFilesToBuild.append(wmbsFile)
        else:
            self.wmbsFilesToBuild.append(wmbsFile)

        if fwjrFile["merged"]:
            self.addFileToDBS(fwjrFile, task)

        return wmbsFile

    def _mapLocation(self, fwkJobReport):
        for file in fwkJobReport.getAllFileRefs():
            if file and hasattr(file, 'location'):
                file.location = self.phedex.getBestNodeName(file.location, self.locLists)

    def handleJob(self, jobID, fwkJobReport):
        """
        _handleJob_

        Figure out if a job was successful or not, handle it appropriately
        (parse FWJR, update WMBS) and return the success status as a boolean.
        """
        jobSuccess = fwkJobReport.taskSuccessful()

        outputMap = self.getOutputMapAction.execute(jobID = jobID,
                                                    conn = self.getDBConn(),
                                                    transaction = self.existingTransaction())

        jobType = self.getJobTypeAction.execute(jobID = jobID,
                                                conn = self.getDBConn(),
                                                transaction = self.existingTransaction())

        if jobSuccess:
            fileList = fwkJobReport.getAllFiles()

            # consistency check comparing outputMap to fileList
            # they should match except for some limited special cases
            outputModules = set([])
            for fwjrFile in fileList:
                outputModules.add(fwjrFile['outputModule'])
            if set(outputMap.keys()) == outputModules:
                pass
            elif jobType == "LogCollect" and len(outputMap.keys()) == 0 and outputModules == set(['LogCollect']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and outputModules == set(['Merged', 'logArchive']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and outputModules == set(['MergedError', 'logArchive']):
                pass
            elif jobType == "Express" and set(outputMap.keys()).difference(outputModules) == set(['write_RAW']):
                pass
            else:
                failJob = True
                if jobType in ["Processing", "Production"]:
                    cmsRunSteps = 0
                    for step in fwkJobReport.listSteps():
                        if step.startswith("cmsRun"):
                            cmsRunSteps += 1
                    if cmsRunSteps > 1:
                        failJob = False

                if failJob:
                    jobSuccess = False
                    logging.error("Job %d , list of expected outputModules does not match job report, failing job", jobID)
                    logging.debug("Job %d , expected outputModules %s", jobID, sorted(outputMap.keys()))
                    logging.debug("Job %d , fwjr outputModules %s", jobID, sorted(outputModules))
                    fileList = fwkJobReport.getAllFilesFromStep(step = 'logArch1')
                else:
                    logging.debug("Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job", jobID)
        else:
            fileList = fwkJobReport.getAllFilesFromStep(step = 'logArch1')

        if jobSuccess:
            logging.info("Job %d , handle successful job", jobID)
        else:
            logging.error("Job %d , bad jobReport, failing job", jobID)

        # make sure the task name is present in FWJR (recover from WMBS if needed)
        if len(fileList) > 0:
            if jobSuccess:
                self.isTaskExistInFWJR(fwkJobReport, "success")
            else:
                self.isTaskExistInFWJR(fwkJobReport, "failed")

        # special check for LogCollect jobs
        skipLogCollect = False
        if jobSuccess and jobType == "LogCollect":
            for fwjrFile in fileList:
                try:
                    # this assumes there is only one file for LogCollect jobs, not sure what happens if that changes
                    self.associateLogCollectToParentJobsInWMStats(fwkJobReport, fwjrFile["lfn"], fwkJobReport.getTaskName())
                except Exception as ex:
                    skipLogCollect = True
                    logging.error("Error occurred: associating log collect location, will try again\n %s" % str(ex))
                    break

        # now handle the job (unless the special LogCollect check failed)
        if not skipLogCollect:
            wmbsJob = Job(id = jobID)
            wmbsJob.load()
            outputID = wmbsJob.loadOutputID()
            wmbsJob.getMask()

            wmbsJob["fwjr"] = fwkJobReport

            if jobSuccess:
                wmbsJob["outcome"] = "success"
            else:
                wmbsJob["outcome"] = "failure"

            for fwjrFile in fileList:
                logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"])

                wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"],
                                              jobID = jobID, task = fwkJobReport.getTaskName())
                merged = fwjrFile['merged']
                moduleLabel = fwjrFile["module_label"]

                if merged:
                    self.mergedOutputFiles.append(wmbsFile)

                self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputID})

                # LogCollect jobs have no output fileset
                if jobType != "LogCollect":
                    outputFilesets = self.outputFilesetsForJob(outputMap, merged, moduleLabel)
                    for outputFileset in outputFilesets:
                        self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputFileset})

            # Check if the job had any skipped files, put them in ACDC containers
            # We assume full file processing (no job masks)
            if jobSuccess:
                skippedFiles = fwkJobReport.getAllSkippedFiles()
                if skippedFiles:
                    self.jobsWithSkippedFiles[jobID] = skippedFiles

            # Only save once job is done, and we're sure we made it through okay
            self._mapLocation(wmbsJob['fwjr'])
            if jobSuccess:
                self.listOfJobsToSave.append(wmbsJob)
            else:
                self.listOfJobsToFail.append(wmbsJob)

        return jobSuccess

    def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logAchiveLFN, task):
        """
        _associateLogCollectToParentJobsInWMStats_

        Associate a logArchive output to its parent job
        """
        inputFileList = fwkJobReport.getAllInputFiles()
        requestName = task.split('/')[1]
        keys = []
        for inputFile in inputFileList:
            keys.append([requestName, inputFile["lfn"]])
        resultRows = self.fwjrCouchDB.loadView("FWJRDump", 'jobsByOutputLFN',
                                               options = {"stale": "update_after"},
                                               keys = keys)['rows']
        if len(resultRows) > 0:
            # get data from wmbs
            parentWMBSJobIDs = []
            for row in resultRows:
                parentWMBSJobIDs.append({"jobid": row["value"]})
            # update Job doc in wmstats
            results = self.getJobInfoByID.execute(parentWMBSJobIDs)
            parentJobNames = []

            if isinstance(results, list):
                for jobInfo in results:
                    parentJobNames.append(jobInfo['name'])
            else:
                parentJobNames.append(results['name'])

            self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN)
        else:
            # TODO: if the couch db is consistent with DB this should be removed (checking resultRows > 0)
            # It needs to be failed and retried.
            logging.error("job report is missing for updating log archive mapping\n Input file list\n %s" % inputFileList)

        return

    def createMissingFWKJR(self, parameters, errorCode = 999,
                           errorDescription = 'Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a missing FWJR if the report can't be found by the code in the
        path location.
        """
        report = Report()
        report.addError("cmsRun1", 84, errorCode, errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report

    def createFilesInDBSBuffer(self):
        """
        _createFilesInDBSBuffer_

        It does the actual job of creating things in DBSBuffer
        WARNING: This assumes all files in a job have the same final location
        """
        if len(self.dbsFilesToCreate) == 0:
            # Whoops, nothing to do!
            return

        dbsFileTuples = []
        dbsFileLoc = []
        dbsCksumBinds = []
        runLumiBinds = []
        selfChecksums = None
        jobLocations = set()

        for dbsFile in self.dbsFilesToCreate:
            # Append a tuple in the format specified by DBSBufferFiles.Add
            # Also run insertDatasetAlgo

            assocID = None
            datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (dbsFile['datasetPath'],
                                                           dbsFile["appName"],
                                                           dbsFile["appVer"],
                                                           dbsFile["appFam"],
                                                           dbsFile["psetHash"],
                                                           dbsFile['processingVer'],
                                                           dbsFile['acquisitionEra'],
                                                           dbsFile['globalTag'])
            # First, check if this is in the cache
            if datasetAlgoPath in self.datasetAlgoPaths:
                for da in self.datasetAlgoID:
                    if da['datasetAlgoPath'] == datasetAlgoPath:
                        assocID = da['assocID']
                        break

            if not assocID:
                # Then we have to get it ourselves
                try:
                    assocID = dbsFile.insertDatasetAlgo()
                    self.datasetAlgoPaths.append(datasetAlgoPath)
                    self.datasetAlgoID.append({'datasetAlgoPath': datasetAlgoPath,
                                               'assocID': assocID})
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath
                    msg += str(ex)
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            # Associate the workflow to the file using the taskPath and the requestName
            # TODO: debug why it happens and then drop/recover these cases automatically
            taskPath = dbsFile.get('task')
            if not taskPath:
                msg = "Can't do workflow association, report this error to a developer.\n"
                msg += "DbsFile : %s" % str(dbsFile)
                raise AccountantWorkerException(msg)
            workflowName = taskPath.split('/')[1]
            workflowPath = '%s:%s' % (workflowName, taskPath)
            if workflowPath in self.workflowPaths:
                for wf in self.workflowIDs:
                    if wf['workflowPath'] == workflowPath:
                        workflowID = wf['workflowID']
                        break
            else:
                result = self.dbsGetWorkflow.execute(workflowName, taskPath, conn = self.getDBConn(),
                                                     transaction = self.existingTransaction())
                workflowID = result['id']
                self.workflowPaths.append(workflowPath)
                self.workflowIDs.append({'workflowPath': workflowPath, 'workflowID': workflowID})

            lfn = dbsFile['lfn']
            selfChecksums = dbsFile['checksums']
            jobLocation = dbsFile.getLocations()[0]
            jobLocations.add(jobLocation)
            dbsFileTuples.append((lfn, dbsFile['size'],
                                  dbsFile['events'], assocID,
                                  dbsFile['status'], workflowID))

            dbsFileLoc.append({'lfn': lfn, 'sename': jobLocation})
            if dbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    dbsCksumBinds.append({'lfn': lfn, 'cksum': selfChecksums[entry],
                                          'cktype': entry})

        try:
            diffLocation = jobLocations.difference(self.dbsLocations)

            for jobLocation in diffLocation:
                self.dbsInsertLocation.execute(siteName = jobLocation,
                                               conn = self.getDBConn(),
                                               transaction = self.existingTransaction())
                self.dbsLocations.add(jobLocation)

            self.dbsCreateFiles.execute(files = dbsFileTuples,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

            self.dbsSetLocation.execute(binds = dbsFileLoc,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

            self.dbsSetChecksum.execute(bulkList = dbsCksumBinds,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

            if len(runLumiBinds) > 0:
                self.dbsSetRunLumi.execute(file = runLumiBinds,
                                           conn = self.getDBConn(),
                                           transaction = self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg = "Got exception while inserting files into DBSBuffer!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Listing binds:")
            logging.debug("jobLocation: %s\n" % jobLocation)
            logging.debug("dbsFiles: %s\n" % dbsFileTuples)
            logging.debug("dbsFileLoc: %s\n" % dbsFileLoc)
            logging.debug("Checksum binds: %s\n" % dbsCksumBinds)
            logging.debug("RunLumi binds: %s\n" % runLumiBinds)
            raise AccountantWorkerException(msg)

        # Now that we've created those files, clear the list
        self.dbsFilesToCreate = []
        return

    def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds):
        """
        _handleWMBSFiles_

        Do what can be done in bulk, in bulk.
        """
        if len(wmbsFilesToBuild) == 0:
            # Nothing to do
            return

        runLumiBinds = []
        fileCksumBinds = []
        fileLocations = []
        fileCreate = []

        for wmbsFile in wmbsFilesToBuild:
            lfn = wmbsFile['lfn']
            if lfn == None:
                continue

            selfChecksums = wmbsFile['checksums']
            # by jobType add to different parentage relation
            # if it is the merge job, don't include the parentage on failed input files.
            # otherwise parentage is set for all input files.
            parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']})

            if wmbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']})

            if len(wmbsFile.getLocations()) > 0:
                fileLocations.append({'lfn': lfn, 'location': wmbsFile.getLocations()[0]})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    fileCksumBinds.append({'lfn': lfn, 'cksum': selfChecksums[entry],
                                           'cktype': entry})

            fileCreate.append([lfn,
                               wmbsFile['size'],
                               wmbsFile['events'],
                               None,
                               wmbsFile["first_event"],
                               wmbsFile['merged']])

        if len(fileCreate) == 0:
            return

        try:
            self.addFileAction.execute(files = fileCreate,
                                       conn = self.getDBConn(),
                                       transaction = self.existingTransaction())

            if runLumiBinds:
                self.setFileRunLumi.execute(file = runLumiBinds,
                                            conn = self.getDBConn(),
                                            transaction = self.existingTransaction())

            self.setFileAddChecksum.execute(bulkList = fileCksumBinds,
                                            conn = self.getDBConn(),
                                            transaction = self.existingTransaction())

            self.setFileLocation.execute(lfn = fileLocations,
                                         location = self.fileLocation,
                                         conn = self.getDBConn(),
                                         transaction = self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg = "Error while adding files to WMBS!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Printing binds: \n")
            logging.debug("FileCreate binds: %s\n" % fileCreate)
            logging.debug("Runlumi binds: %s\n" % runLumiBinds)
            logging.debug("Checksum binds: %s\n" % fileCksumBinds)
            logging.debug("FileLocation binds: %s\n" % fileLocations)
            raise AccountantWorkerException(msg)

        # Clear out finished files
        wmbsFilesToBuild = []
        return

    def createFileFromDataStructsFile(self, file, jobID):
        """
        _createFileFromDataStructsFile_

        This function will create a WMBS File given a DataStructs file
        """
        wmbsFile = File()
        wmbsFile.update(file)

        if isinstance(file["locations"], set):
            pnn = list(file["locations"])[0]
        elif isinstance(file["locations"], list):
            if len(file['locations']) > 1:
                logging.error("Have more than one location for a file in job %i" % (jobID))
                logging.error("Choosing location %s" % (file['locations'][0]))
            pnn = file["locations"][0]
        else:
            pnn = file["locations"]

        wmbsFile["locations"] = set()

        if pnn != None:
            wmbsFile.setLocation(pnn = pnn, immediateSave = False)
        wmbsFile['jid'] = jobID

        return wmbsFile

    def handleDBSBufferParentage(self):
        """
        _handleDBSBufferParentage_

        Handle all the DBSBuffer Parentage in bulk if you can
        """
        outputLFNs = [f['lfn'] for f in self.mergedOutputFiles]
        bindList = []
        for lfn in outputLFNs:
            newParents = self.findDBSParents(lfn = lfn)
            for parentLFN in newParents:
                bindList.append({'child': lfn, 'parent': parentLFN})

        # Now all the parents should exist
        # Commit them to DBSBuffer
        logging.info("About to commit all DBSBuffer Heritage information")
        logging.info(len(bindList))

        if len(bindList) > 0:
            try:
                self.dbsLFNHeritage.execute(binds = bindList,
                                            conn = self.getDBConn(),
                                            transaction = self.existingTransaction())
            except WMException:
                raise
            except Exception as ex:
                msg = "Error while trying to handle the DBS LFN heritage\n"
                msg += str(ex)
                msg += "BindList: %s" % bindList
                logging.error(msg)
                raise AccountantWorkerException(msg)
        return

    def handleSkippedFiles(self):
        """
        _handleSkippedFiles_

        Handle all the skipped files in bulk.  The way it handles the skipped
        files imposes an important restriction: skipped files should have been
        processed by a single job in the task and no job mask exists in it.
        This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased
        splitting algorithms.  Here ACDC records are created and the files are
        moved to wmbs_sub_files_failed from completed.
        """
        jobList = self.getFullJobInfo.execute([{'jobid': x} for x in self.jobsWithSkippedFiles.keys()],
                                              fileSelection = self.jobsWithSkippedFiles,
                                              conn = self.getDBConn(),
                                              transaction = self.existingTransaction())
        self.dataCollection.failedJobs(jobList, useMask = False)
        return
class RequestQuery:

    def __init__(self, config):
        self.br = Browser()
        self.config = config

        # Initialise connections
        self.phedex = PhEDEx({"endpoint": "https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url + "phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url + "phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url + "phys03/DBSReader/")

    def __del__(self):
        self.br.close()

    def getScramArchByCMSSW(self):
        """
        Get the list of available CMSSW releases and
        return a dictionary of ScramArchitecture by CMSSW.
        """
        # Set temporary connection to the server and get the response from cmstags
        url = "https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML"
        br = Browser()
        br.set_handle_robots(False)
        response = br.open(url)
        soup = BeautifulSoup(response.read())

        # Dictionary form
        # {'CMSSW_X_X_X': ['slc5_amd64_gcc472'], ...}
        archByCmssw = {}

        # Fill the dictionary
        for arch in soup.find_all("architecture"):
            for cmssw in arch.find_all("project"):
                # CMSSW release
                cmsswLabel = cmssw.get("label").encode("ascii", "ignore")
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel] = []
                # ScramArch related to this CMSSW release
                archName = arch.get("name").encode("ascii", "ignore")
                archByCmssw[cmsswLabel].append(archName)

        return archByCmssw

    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return a list of block origin sites.
        """
        local_dbs = dbs_url.split("/")[5]
        if local_dbs == "phys01":
            response = self.dbsPhys01.listBlocks(detail=True, dataset=data)
        elif local_dbs == "phys02":
            response = self.dbsPhys02.listBlocks(detail=True, dataset=data)
        elif local_dbs == "phys03":
            response = self.dbsPhys03.listBlocks(detail=True, dataset=data)

        seList = []
        for block in response:
            if block["origin_site_name"] not in seList:
                seList.append(block["origin_site_name"])

        siteNames = []
        for node in self.nodeMappings["phedex"]["node"]:
            if node["se"] in seList:
                siteNames.append(node["name"])

        return siteNames, seList

    def phEDExNodetocmsName(self, nodeList):
        """
        Convert a list of PhEDEx node names to a list of CMS names.
        """
        names = []
        for node in nodeList:
            name = node.replace("_MSS", "").replace("_Disk", "").replace("_Buffer", "").replace("_Export", "")
            if name not in names:
                names.append(name)
        return names

    def setGlobalTagFromOrigin(self, dbs_url, input_dataset):
        """
        Get the global tag of the dataset from the source dbs url.
        If it is not set, then set the global tag to 'UNKNOWN'.
        """
        globalTag = ""
        local_dbs = dbs_url.split("/")[5]
        if local_dbs == "phys01":
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == "phys02":
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == "phys03":
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)

        globalTag = response[0]["global_tag"]
        # GlobalTag cannot be empty
        if globalTag == "":
            globalTag = "UNKNOWN"

        return globalTag

    def isDataAtUrl(self, dbs_url, input_dataset):
        """
        Returns True if the dataset is at the dbs url, otherwise False.
        """
        local_dbs = dbs_url.split("/")[5]
        if local_dbs == "phys01":
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == "phys02":
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == "phys03":
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)

        # An empty response means that the dataset is not at the url
        if not response:
            return False
        else:
            return True

    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary keyed by value.
        """
        d = {}
        for item in control.items:
            value = item.attrs["value"]
            label = item.attrs["label"]
            d[value] = label
        return d

    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary keyed by label.
        """
        d = {}
        for item in control.items:
            value = item.attrs["value"]
            label = item.attrs["label"]
            d[label] = value
        return d

    def createRequestJSON(self, ticket, input_dataset, dbs_url, cmssw_release, group_name, version=1):
        """
        Creates a JSON file 'Ticket_#TICKET.json' with the needed information
        for creating a request on ReqMgr.
        Input:
            - ticket: the ticket number, for instance 110773 on
              https://ggus.eu/?mode=ticket_info&ticket_id=110773
            - input_dataset
            - dbs_url: only the instance name, for example "phys01" for
              https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader
            - cmssw_release
            - group_name: the physics group name
            - version: the dataset version, 1 by default.
        It returns a dictionary that contains the request information.
        """
        scramArchByCMSSW = self.getScramArchByCMSSW()
        self.nodeMappings = self.phedex.getNodeMap()

        task = ticket
        print("Processing ticket: %s" % task)

        # splitting input dataset
        input_primary_dataset = input_dataset.split("/")[1].replace(" ", "")
        input_processed_dataset = input_dataset.split("/")[2].replace(" ", "")
        data_tier = input_dataset.split("/")[3].replace(" ", "")

        # Transform input value to a valid DBS url
        # dbs_url = "https://cmsweb.cern.ch/dbs/prod/" + dbs_url + "/DBSReader"
        dbs_url = dbs_base_url + dbs_url + "/DBSReader"
        release_id = cmssw_release

        # check if deprecated release was used
        release = cmssw_release
        # check if release has no ScramArch match
        if release not in scramArchByCMSSW:
            raise Exception("Error on ticket %s due to ScramArch mismatch" % task)
        else:
            scram_arch = scramArchByCMSSW[release][-1]

        # check if dataset is not at dbs url
        try:
            data_at_url = self.isDataAtUrl(dbs_url, input_dataset)
        except:
            raise Exception("Error on ticket %s, dataset %s not available at %s" % (task, input_dataset, dbs_url))

        if not data_at_url:
            raise Exception("Error on ticket %s, dataset %s not available at %s" % (task, input_dataset, dbs_url))

        ## Get Physics Group
        group_squad = "cms-storeresults-" + group_name.replace("-", "_").lower()

        ## Get Dataset Version
        dataset_version = str(version)

        # Set default Acquisition Era for StoreResults
        acquisitionEra = "StoreResults"

        ## Construction of the new dataset name (ProcessingString)
        ## remove leading hypernews or physics group name and StoreResults+Version
        if input_processed_dataset.find(group_name) == 0:
            new_dataset = input_processed_dataset.replace(group_name, "", 1)
        else:
            stripped_dataset = input_processed_dataset.split("-")[1:]
            new_dataset = "_".join(stripped_dataset)

        # Get dataset site info:
        phedex_map, se_names = self.getDatasetOriginSites(dbs_url, input_dataset)
        sites = self.phEDExNodetocmsName(phedex_map)

        infoDict = {}
        # Build store results json
        # First add all the default values
        infoDict["RequestType"] = "StoreResults"
        infoDict["UnmergedLFNBase"] = "/store/unmerged"
        infoDict["MergedLFNBase"] = "/store/results/" + group_name.replace("-", "_").lower()
        infoDict["MinMergeSize"] = 1500000000
        infoDict["MaxMergeSize"] = 5000000000
        infoDict["MaxMergeEvents"] = 100000
        infoDict["TimePerEvent"] = 40
        infoDict["SizePerEvent"] = 512.0
        infoDict["Memory"] = 2394
        infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"
        infoDict["Group"] = "DATAOPS"
        infoDict["DbsUrl"] = dbs_url

        # Add all the information pulled from Savannah
        infoDict["AcquisitionEra"] = acquisitionEra
        infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url, input_dataset)
        infoDict["DataTier"] = data_tier
        infoDict["InputDataset"] = input_dataset
        infoDict["ProcessingString"] = new_dataset
        infoDict["CMSSWVersion"] = release
        infoDict["ScramArch"] = scram_arch
        infoDict["ProcessingVersion"] = dataset_version
        infoDict["SiteWhitelist"] = list(sites)

        # Create report for Migration2Global
        report = {}

        # Fill json file, if status is done
        self.writeJSONFile(task, infoDict)
        report["json"] = "y"
        report["task"] = int(task)
        report["InputDataset"] = input_dataset
        report["ProcessingString"] = new_dataset
        report["localUrl"] = dbs_url
        report["sites"] = list(sites)
        report["se_names"] = list(se_names)

        return report

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        # check if file already exists
        filename = self.config["ComponentDir"] + "/Ticket_" + str(task) + ".json"
        if not os.access(filename, os.F_OK):
            jsonfile = open(filename, "w")
            request = {"createRequest": infoDict}  ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request, sort_keys=True, indent=4))
            jsonfile.close()

        return

    def removeJSONFile(self, task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"] + "/Ticket_" + str(task) + ".json"
        if os.access(filename, os.F_OK):
            os.remove(filename)

        return

    def printReport(self, report):
        """
        Print out a report
        """
        print("%20s %5s %10s %50s %50s" % ("Ticket", "json", "local DBS", "Sites", "se_names"))
        print("%20s %5s %10s %50s %50s" % ("-" * 20, "-" * 5, "-" * 10, "-" * 50, "-" * 50))

        json = report["json"]
        ticket = report["task"]
        # status = report["ticketStatus"]
        localUrl = report["localUrl"].split("/")[5]
        site = ", ".join(report["sites"])
        se_names = ", ".join(report["se_names"])
        print("%20s %5s %10s %50s %50s" % (ticket, json, localUrl, site, se_names))
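A hedged usage sketch of the class above (the config dict, ticket number, dataset, and group are made up, and `dbs_base_url` must already be defined in the module):

# hypothetical driver; values are illustrative only
config = {"ComponentDir": "/tmp/storeresults"}
rq = RequestQuery(config)
report = rq.createRequestJSON(ticket="110773",
                              input_dataset="/Primary/group-Processed-v1/USER",
                              dbs_url="phys03",
                              cmssw_release="CMSSW_5_3_9",
                              group_name="group")
rq.printReport(report)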
def testNormalModeSubscriptions(self): """ _testNormalModeSubscriptions_ Tests that we can make custodial/non-custodial subscriptions on normal operation mode, this time we don't need WMBS for anything. All is subscribed in one go. Check that the requests are correct. """ self.stuffDatabase() config = self.createConfig() phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") try: nodeMappings = phedex.getNodeMap() except Exception: time.sleep(2) try: nodeMappings = phedex.getNodeMap() except Exception: time.sleep(4) nodeMappings = phedex.getNodeMap() subscriber = PhEDExInjectorSubscriber(config, phedex, nodeMappings) subscriber.setup({}) subscriber.algorithm({}) phedexInstance = subscriber.phedex subscriptions = phedexInstance.subRequests # Let's check /BogusPrimary/Run2012Z-PromptReco-v1/RECO # According to the spec, this should be custodial at T1_US_FNAL # Non-custodial at T1_UK_RAL and T3_CO_Uniandes # Autoapproved in all sites # Priority is normal self.assertTrue(self.testDatasetA in subscriptions, "Dataset A was not subscribed") subInfoA = subscriptions[self.testDatasetA] self.assertEqual(len(subInfoA), 3, "Dataset A was not subscribed to all sites") for subInfo in subInfoA: site = subInfo["node"] self.assertEqual(subInfo["priority"], "normal", "Wrong priority for subscription") if site == "T1_UK_RAL_MSS" or site == "T3_CO_Uniandes": self.assertEqual(subInfo["custodial"], "n", "Wrong custodiality for dataset A at %s" % subInfo["node"]) self.assertEqual(subInfo["request_only"], "n", "Wrong requestOnly for dataset A at %s" % subInfo["node"]) self.assertEqual(subInfo["move"], "n", "Wrong subscription type for dataset A at %s" % subInfo["node"]) elif site == "T1_US_FNAL_MSS": self.assertEqual(subInfo["custodial"], "y", "Wrong custodiality for dataset A at %s" % subInfo["node"]) self.assertEqual(subInfo["request_only"], "n", "Wrong requestOnly for dataset A at %s" % subInfo["node"]) self.assertEqual(subInfo["move"], "y", "Wrong subscription type for dataset A at %s" % subInfo["node"]) else: self.fail("Dataset A was subscribed to a wrong site %s" % site) # Now check /BogusPrimary/CRUZET11-v1/RAW # According to the spec, this is not custodial anywhere # Non-custodial at T1_UK_RAL and T2_CH_CERN # Request only at both sites and with high priority self.assertTrue(self.testDatasetB in subscriptions, "Dataset B was not subscribed") subInfoB = subscriptions[self.testDatasetB] self.assertEqual(len(subInfoB), 2, "Dataset B was not subscribed to all sites") for subInfo in subInfoB: site = subInfo["node"] self.assertEqual(subInfo["priority"], "high", "Wrong priority for subscription") if site == "T1_UK_RAL_MSS" or site == "T2_CH_CERN": self.assertEqual(subInfo["custodial"], "n", "Wrong custodiality for dataset B at %s" % subInfo["node"]) self.assertEqual(subInfo["request_only"], "y", "Wrong requestOnly for dataset B at %s" % subInfo["node"]) self.assertEqual(subInfo["move"], "n", "Wrong subscription type for dataset B at %s" % subInfo["node"]) else: self.fail("Dataset B was subscribed to a wrong site %s" % site) myThread = threading.currentThread() result = myThread.dbi.processData("SELECT COUNT(*) FROM dbsbuffer_dataset_subscription where subscribed = 1")[0].fetchall() self.assertEqual(result[0][0], 5, "Not all datasets were marked as subscribed") result = myThread.dbi.processData("SELECT site FROM dbsbuffer_dataset_subscription where subscribed = 0")[0].fetchall() self.assertEqual(result[0][0], "T1_IT_CNAF", "A non-valid CMS site was subscribed") # Reset and run again and make sure 
that no duplicate subscriptions are created
        myThread.dbi.processData("UPDATE dbsbuffer_dataset_subscription SET subscribed = 0")
        subscriber.algorithm({})
        self.assertEqual(len(subscriptions[self.testDatasetA]), 3)
        self.assertEqual(len(subscriptions[self.testDatasetB]), 2)
        return
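# The nested try/except blocks used above (and in the component's
# preInitialization) to fetch the PhEDEx node map repeat the same
# sleep-and-retry pattern three times. A minimal helper could factor it out.
# This is an illustrative sketch only: the function name is hypothetical and
# not part of WMCore; it assumes `time` is already imported (it is used
# above), and the doubling backoff mirrors the 2s/4s sleeps in the original.
def getNodeMapWithRetries(phedex, retries=3, delay=2):
    """Call phedex.getNodeMap(), retrying with a doubling sleep on failure."""
    for attempt in range(retries):
        try:
            return phedex.getNodeMap()
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay)
            delay *= 2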
class AccountantWorker(WMConnectionBase): """ Class that actually does the work of parsing FWJRs for the Accountant Run through ProcessPool """ def __init__(self, config): """ __init__ Create all DAO objects that are used by this class. """ WMConnectionBase.__init__(self, "WMCore.WMBS") myThread = threading.currentThread() self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer", logger=myThread.logger, dbinterface=myThread.dbi) self.getOutputMapAction = self.daofactory( classname="Jobs.GetOutputMap") self.bulkAddToFilesetAction = self.daofactory( classname="Fileset.BulkAddByLFN") self.bulkParentageAction = self.daofactory( classname="Files.AddBulkParentage") self.getJobTypeAction = self.daofactory(classname="Jobs.GetType") self.getParentInfoAction = self.daofactory( classname="Files.GetParentInfo") self.setParentageByJob = self.daofactory( classname="Files.SetParentageByJob") self.setParentageByMergeJob = self.daofactory( classname="Files.SetParentageByMergeJob") self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi") self.setFileLocation = self.daofactory( classname="Files.SetLocationByLFN") self.setFileAddChecksum = self.daofactory( classname="Files.AddChecksumByLFN") self.addFileAction = self.daofactory(classname="Files.Add") self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput") self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk") self.getWorkflowSpec = self.daofactory( classname="Workflow.GetSpecAndNameFromTask") self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID") self.getFullJobInfo = self.daofactory( classname="Jobs.LoadForErrorHandler") self.getJobTaskNameAction = self.daofactory( classname="Jobs.GetFWJRTaskName") self.pnn_to_psn = self.daofactory( classname="Locations.GetPNNtoPSNMapping").execute() self.dbsStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.SetStatus") self.dbsParentStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetParentStatus") self.dbsChildrenAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetChildren") self.dbsCreateFiles = self.dbsDaoFactory( classname="DBSBufferFiles.Add") self.dbsSetLocation = self.dbsDaoFactory( classname="DBSBufferFiles.SetLocationByLFN") self.dbsInsertLocation = self.dbsDaoFactory( classname="DBSBufferFiles.AddLocation") self.dbsSetChecksum = self.dbsDaoFactory( classname="DBSBufferFiles.AddChecksumByLFN") self.dbsSetRunLumi = self.dbsDaoFactory( classname="DBSBufferFiles.AddRunLumi") self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow") self.dbsLFNHeritage = self.dbsDaoFactory( classname="DBSBufferFiles.BulkHeritageParent") self.stateChanger = ChangeState(config) # Decide whether or not to attach jobReport to returned value self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False) # Store location for the specs for DBS self.specDir = getattr(config.JobAccountant, 'specDir', None) # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco self.maxAllowedRepackOutputSize = getattr( config.JobAccountant, 'maxAllowedRepackOutputSize', 12 * 1024 * 1024 * 1024) # ACDC service self.dataCollection = DataCollectionService( url=config.ACDC.couchurl, database=config.ACDC.database) jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url'] jobDBName = config.JobStateMachine.couchDBName jobCouchdb = CouchServer(jobDBurl) self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName) self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, 
appName="WMStatsAgent") # Hold data for later commital self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} self.count = 0 self.datasetAlgoID = collections.deque(maxlen=1000) self.datasetAlgoPaths = collections.deque(maxlen=1000) self.dbsLocations = set() self.workflowIDs = collections.deque(maxlen=1000) self.workflowPaths = collections.deque(maxlen=1000) self.phedex = PhEDEx() self.locLists = self.phedex.getNodeMap() return def reset(self): """ _reset_ Reset all global vars between runs. """ self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} gc.collect() return def loadJobReport(self, parameters): """ _loadJobReport_ Given a framework job report on disk, load it and return a FwkJobReport instance. If there is any problem loading or parsing the framework job report return None. """ # The jobReportPath may be prefixed with "file://" which needs to be # removed so it doesn't confuse the FwkJobReport() parser. jobReportPath = parameters.get("fwjr_path", None) if not jobReportPath: logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty") jobReportPath = jobReportPath.replace("file://", "") if not os.path.exists(jobReportPath): logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath) if os.path.getsize(jobReportPath) == 0: logging.error("Empty FwkJobReport: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath) jobReport = Report() try: jobReport.load(jobReportPath) except Exception as ex: msg = "Error loading jobReport %s\n" % jobReportPath msg += str(ex) logging.error(msg) logging.debug("Failing job: %s\n" % parameters) return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport') if len(jobReport.listSteps()) == 0: logging.error("FwkJobReport with no steps: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath) return jobReport def isTaskExistInFWJR(self, jobReport, jobStatus): """ If taskName is not available in the FWJR, then tries to recover it getting data from the SQL database. """ if not jobReport.getTaskName(): logging.warning( "Trying to recover a corrupted FWJR for a %s job with job id %s" % (jobStatus, jobReport.getJobID())) jobInfo = self.getJobTaskNameAction.execute( jobId=jobReport.getJobID(), conn=self.getDBConn(), transaction=self.existingTransaction()) jobReport.setTaskName(jobInfo['taskName']) jobReport.save(jobInfo['fwjr_path']) if not jobReport.getTaskName(): msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % ( jobStatus, jobReport.getJobID()) raise AccountantWorkerException(msg) else: logging.info( "TaskName '%s' successfully recovered and added to fwjr id %s." % (jobReport.getTaskName(), jobReport.getJobID())) return def __call__(self, parameters): """ __call__ Handle a completed job. 
The parameters dictionary will contain the job ID and the path to the framework job report. """ returnList = [] self.reset() for job in parameters: logging.info("Handling %s" % job["fwjr_path"]) # Load the job and set the ID fwkJobReport = self.loadJobReport(job) fwkJobReport.setJobID(job['id']) jobSuccess = self.handleJob(jobID=job["id"], fwkJobReport=fwkJobReport) if self.returnJobReport: returnList.append({ 'id': job["id"], 'jobSuccess': jobSuccess, 'jobReport': fwkJobReport }) else: returnList.append({'id': job["id"], 'jobSuccess': jobSuccess}) self.count += 1 self.beginTransaction() # Now things done at the end of the job # Do what we can with WMBS files self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds) # handle merge files separately since parentage need to set # separately to support robust merge self.handleWMBSFiles(self.wmbsMergeFilesToBuild, self.parentageBindsForMerge) # Create DBSBufferFiles self.createFilesInDBSBuffer() # Handle filesetAssoc if len(self.filesetAssoc) > 0: self.bulkAddToFilesetAction.execute( binds=self.filesetAssoc, conn=self.getDBConn(), transaction=self.existingTransaction()) # Move successful jobs to successful if len(self.listOfJobsToSave) > 0: idList = [x['id'] for x in self.listOfJobsToSave] outcomeBinds = [{ 'jobid': x['id'], 'outcome': x['outcome'] } for x in self.listOfJobsToSave] self.setBulkOutcome.execute(binds=outcomeBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.jobCompleteInput.execute( id=idList, lfnsToSkip=self.jobsWithSkippedFiles, conn=self.getDBConn(), transaction=self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToSave, "success", "complete") # If we have failed jobs, fail them if len(self.listOfJobsToFail) > 0: outcomeBinds = [{ 'jobid': x['id'], 'outcome': x['outcome'] } for x in self.listOfJobsToFail] self.setBulkOutcome.execute(binds=outcomeBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed", "complete") # Arrange WMBS parentage if len(self.parentageBinds) > 0: self.setParentageByJob.execute( binds=self.parentageBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) if len(self.parentageBindsForMerge) > 0: self.setParentageByMergeJob.execute( binds=self.parentageBindsForMerge, conn=self.getDBConn(), transaction=self.existingTransaction()) # Straighten out DBS Parentage if len(self.mergedOutputFiles) > 0: self.handleDBSBufferParentage() if len(self.jobsWithSkippedFiles) > 0: self.handleSkippedFiles() self.commitTransaction(existingTransaction=False) return returnList def outputFilesetsForJob(self, outputMap, merged, moduleLabel): """ _outputFilesetsForJob_ Determine if the file should be placed in any other fileset. Note that this will not return the JobGroup output fileset as all jobs will have their output placed there. """ if moduleLabel not in outputMap: logging.info("Output module label missing from output map.") return [] outputFilesets = [] for outputFileset in outputMap[moduleLabel]: if merged == False and outputFileset["output_fileset"] != None: outputFilesets.append(outputFileset["output_fileset"]) else: if outputFileset["merged_output_fileset"] != None: outputFilesets.append( outputFileset["merged_output_fileset"]) return outputFilesets def addFileToDBS(self, jobReportFile, task, errorDataset=False): """ _addFileToDBS_ Add a file that was output from a job to the DBS buffer. 
""" datasetInfo = jobReportFile["dataset"] dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"], size=jobReportFile["size"], events=jobReportFile["events"], checksums=jobReportFile["checksums"], status="NOTUPLOADED") dbsFile.setAlgorithm(appName=datasetInfo["applicationName"], appVer=datasetInfo["applicationVersion"], appFam=jobReportFile["module_label"], psetHash="GIBBERISH", configContent=jobReportFile.get('configURL')) if errorDataset: dbsFile.setDatasetPath( "/%s/%s/%s" % (datasetInfo["primaryDataset"] + "-Error", datasetInfo["processedDataset"], datasetInfo["dataTier"])) else: dbsFile.setDatasetPath( "/%s/%s/%s" % (datasetInfo["primaryDataset"], datasetInfo["processedDataset"], datasetInfo["dataTier"])) dbsFile.setValidStatus( validStatus=jobReportFile.get("validStatus", None)) dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None)) dbsFile.setAcquisitionEra( era=jobReportFile.get('acquisitionEra', None)) dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None)) #TODO need to find where to get the prep id dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None)) dbsFile['task'] = task for run in jobReportFile["runs"]: newRun = Run(runNumber=run.run) newRun.extend(run.lumis) dbsFile.addRun(newRun) dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0], immediateSave=False) self.dbsFilesToCreate.append(dbsFile) return def findDBSParents(self, lfn): """ _findDBSParents_ Find the parent of the file in DBS This is meant to be called recursively """ parentsInfo = self.getParentInfoAction.execute( [lfn], conn=self.getDBConn(), transaction=self.existingTransaction()) newParents = set() for parentInfo in parentsInfo: # This will catch straight to merge files that do not have redneck # parents. We will mark the straight to merge file from the job # as a child of the merged parent. if int(parentInfo["merged"]) == 1: newParents.add(parentInfo["lfn"]) elif parentInfo['gpmerged'] == None: continue # Handle the files that result from merge jobs that aren't redneck # children. We have to setup parentage and then check on whether or # not this file has any redneck children and update their parentage # information. elif int(parentInfo["gpmerged"]) == 1: newParents.add(parentInfo["gplfn"]) # If that didn't work, we've reached the great-grandparents # And we have to work via recursion else: parentSet = self.findDBSParents(lfn=parentInfo['gplfn']) for parent in parentSet: newParents.add(parent) return newParents def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID=None): """ _addFileToWMBS_ Add a file that was produced in a job to WMBS. 
""" fwjrFile["first_event"] = jobMask["FirstEvent"] if fwjrFile["first_event"] == None: fwjrFile["first_event"] = 0 if jobType == "Merge" and fwjrFile["module_label"] != "logArchive": setattr(fwjrFile["fileRef"], 'merged', True) fwjrFile["merged"] = True wmbsFile = self.createFileFromDataStructsFile(file=fwjrFile, jobID=jobID) if jobType == "Merge": self.wmbsMergeFilesToBuild.append(wmbsFile) else: self.wmbsFilesToBuild.append(wmbsFile) if fwjrFile["merged"]: self.addFileToDBS( fwjrFile, task, jobType == "Repack" and fwjrFile["size"] > self.maxAllowedRepackOutputSize) return wmbsFile def _mapLocation(self, fwkJobReport): for file in fwkJobReport.getAllFileRefs(): if file and hasattr(file, 'location'): file.location = self.phedex.getBestNodeName( file.location, self.locLists) def handleJob(self, jobID, fwkJobReport): """ _handleJob_ Figure out if a job was successful or not, handle it appropriately (parse FWJR, update WMBS) and return the success status as a boolean """ jobSuccess = fwkJobReport.taskSuccessful() outputMap = self.getOutputMapAction.execute( jobID=jobID, conn=self.getDBConn(), transaction=self.existingTransaction()) jobType = self.getJobTypeAction.execute( jobID=jobID, conn=self.getDBConn(), transaction=self.existingTransaction()) if jobSuccess: fileList = fwkJobReport.getAllFiles() # consistency check comparing outputMap to fileList # they should match except for some limited special cases outputModules = set([]) for fwjrFile in fileList: outputModules.add(fwjrFile['outputModule']) if set(outputMap.keys()) == outputModules: pass elif jobType == "LogCollect" and len( outputMap.keys()) == 0 and outputModules == set( ['LogCollect']): pass elif jobType == "Merge" and set(outputMap.keys()) == set([ 'Merged', 'MergedError', 'logArchive' ]) and outputModules == set(['Merged', 'logArchive']): pass elif jobType == "Merge" and set(outputMap.keys()) == set([ 'Merged', 'MergedError', 'logArchive' ]) and outputModules == set(['MergedError', 'logArchive']): pass elif jobType == "Express" and set( outputMap.keys()).difference(outputModules) == set( ['write_RAW']): pass else: failJob = True if jobType in ["Processing", "Production"]: cmsRunSteps = 0 for step in fwkJobReport.listSteps(): if step.startswith("cmsRun"): cmsRunSteps += 1 if cmsRunSteps > 1: failJob = False if failJob: jobSuccess = False logging.error( "Job %d , list of expected outputModules does not match job report, failing job", jobID) logging.debug("Job %d , expected outputModules %s", jobID, sorted(outputMap.keys())) logging.debug("Job %d , fwjr outputModules %s", jobID, sorted(outputModules)) fileList = fwkJobReport.getAllFilesFromStep( step='logArch1') else: logging.debug( "Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job", jobID) else: fileList = fwkJobReport.getAllFilesFromStep(step='logArch1') if jobSuccess: logging.info("Job %d , handle successful job", jobID) else: logging.warning("Job %d , bad jobReport, failing job", jobID) # make sure the task name is present in FWJR (recover from WMBS if needed) if len(fileList) > 0: if jobSuccess: self.isTaskExistInFWJR(fwkJobReport, "success") else: self.isTaskExistInFWJR(fwkJobReport, "failed") # special check for LogCollect jobs skipLogCollect = False if jobSuccess and jobType == "LogCollect": for fwjrFile in fileList: try: # this assumes there is only one file for LogCollect jobs, not sure what happend if that changes self.associateLogCollectToParentJobsInWMStats( fwkJobReport, fwjrFile["lfn"], fwkJobReport.getTaskName()) 
except Exception as ex: skipLogCollect = True logging.error( "Error occurred: associating log collect location, will try again\n %s" % str(ex)) break # now handle the job (unless the special LogCollect check failed) if not skipLogCollect: wmbsJob = Job(id=jobID) wmbsJob.load() outputID = wmbsJob.loadOutputID() wmbsJob.getMask() wmbsJob["fwjr"] = fwkJobReport if jobSuccess: wmbsJob["outcome"] = "success" else: wmbsJob["outcome"] = "failure" for fwjrFile in fileList: logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"]) wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"], jobID=jobID, task=fwkJobReport.getTaskName()) merged = fwjrFile['merged'] moduleLabel = fwjrFile["module_label"] if merged: self.mergedOutputFiles.append(wmbsFile) self.filesetAssoc.append({ "lfn": wmbsFile["lfn"], "fileset": outputID }) # LogCollect jobs have no output fileset if jobType == "LogCollect": pass # Repack jobs that wrote too large merged output skip output filesets elif jobType == "Repack" and merged and wmbsFile[ "size"] > self.maxAllowedRepackOutputSize: pass else: outputFilesets = self.outputFilesetsForJob( outputMap, merged, moduleLabel) for outputFileset in outputFilesets: self.filesetAssoc.append({ "lfn": wmbsFile["lfn"], "fileset": outputFileset }) # Check if the job had any skipped files, put them in ACDC containers # We assume full file processing (no job masks) if jobSuccess: skippedFiles = fwkJobReport.getAllSkippedFiles() if skippedFiles and jobType not in ['LogCollect', 'Cleanup']: self.jobsWithSkippedFiles[jobID] = skippedFiles # Only save once job is done, and we're sure we made it through okay self._mapLocation(wmbsJob['fwjr']) if jobSuccess: self.listOfJobsToSave.append(wmbsJob) else: self.listOfJobsToFail.append(wmbsJob) return jobSuccess def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logAchiveLFN, task): """ _associateLogCollectToParentJobsInWMStats_ Associate a logArchive output to its parent job """ inputFileList = fwkJobReport.getAllInputFiles() requestName = task.split('/')[1] keys = [] for inputFile in inputFileList: keys.append([requestName, inputFile["lfn"]]) resultRows = self.fwjrCouchDB.loadView( "FWJRDump", 'jobsByOutputLFN', options={"stale": "update_after"}, keys=keys)['rows'] if len(resultRows) > 0: #get data from wmbs parentWMBSJobIDs = [] for row in resultRows: parentWMBSJobIDs.append({"jobid": row["value"]}) #update Job doc in wmstats results = self.getJobInfoByID.execute(parentWMBSJobIDs) parentJobNames = [] if isinstance(results, list): for jobInfo in results: parentJobNames.append(jobInfo['name']) else: parentJobNames.append(results['name']) self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN) else: #TODO: if the couch db is consistent with DB this should be removed (checking resultRow > 0) #It need to be failed and retried. logging.error( "job report is missing for updating log archive mapping\n Input file list\n %s" % inputFileList) return def createMissingFWKJR(self, parameters, errorCode=999, errorDescription='Failure of unknown type'): """ _createMissingFWJR_ Create a missing FWJR if the report can't be found by the code in the path location. 
""" report = Report() report.addError("cmsRun1", 84, errorCode, errorDescription) report.data.cmsRun1.status = "Failed" return report def createFilesInDBSBuffer(self): """ _createFilesInDBSBuffer_ It does the actual job of creating things in DBSBuffer WARNING: This assumes all files in a job have the same final location """ if len(self.dbsFilesToCreate) == 0: # Whoops, nothing to do! return dbsFileTuples = [] dbsFileLoc = [] dbsCksumBinds = [] runLumiBinds = [] selfChecksums = None jobLocations = set() for dbsFile in self.dbsFilesToCreate: # Append a tuple in the format specified by DBSBufferFiles.Add # Also run insertDatasetAlgo assocID = None datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % ( dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"], dbsFile["appFam"], dbsFile["psetHash"], dbsFile['processingVer'], dbsFile['acquisitionEra'], dbsFile['globalTag']) # First, check if this is in the cache if datasetAlgoPath in self.datasetAlgoPaths: for da in self.datasetAlgoID: if da['datasetAlgoPath'] == datasetAlgoPath: assocID = da['assocID'] break if not assocID: # Then we have to get it ourselves try: assocID = dbsFile.insertDatasetAlgo() self.datasetAlgoPaths.append(datasetAlgoPath) self.datasetAlgoID.append({ 'datasetAlgoPath': datasetAlgoPath, 'assocID': assocID }) except WMException: raise except Exception as ex: msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath msg += str(ex) logging.error(msg) raise AccountantWorkerException(msg) # Associate the workflow to the file using the taskPath and the requestName # TODO: debug why it happens and then drop/recover these cases automatically taskPath = dbsFile.get('task') if not taskPath: msg = "Can't do workflow association, report this error to a developer.\n" msg += "DbsFile : %s" % str(dbsFile) raise AccountantWorkerException(msg) workflowName = taskPath.split('/')[1] workflowPath = '%s:%s' % (workflowName, taskPath) if workflowPath in self.workflowPaths: for wf in self.workflowIDs: if wf['workflowPath'] == workflowPath: workflowID = wf['workflowID'] break else: result = self.dbsGetWorkflow.execute( workflowName, taskPath, conn=self.getDBConn(), transaction=self.existingTransaction()) workflowID = result['id'] self.workflowPaths.append(workflowPath) self.workflowIDs.append({ 'workflowPath': workflowPath, 'workflowID': workflowID }) lfn = dbsFile['lfn'] selfChecksums = dbsFile['checksums'] jobLocation = dbsFile.getLocations()[0] jobLocations.add(jobLocation) dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'], assocID, dbsFile['status'], workflowID)) dbsFileLoc.append({'lfn': lfn, 'pnn': jobLocation}) if dbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']}) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): dbsCksumBinds.append({ 'lfn': lfn, 'cksum': selfChecksums[entry], 'cktype': entry }) try: diffLocation = jobLocations.difference(self.dbsLocations) for jobLocation in diffLocation: self.dbsInsertLocation.execute( siteName=jobLocation, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsLocations.add(jobLocation) self.dbsCreateFiles.execute(files=dbsFileTuples, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsSetLocation.execute(binds=dbsFileLoc, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsSetChecksum.execute(bulkList=dbsCksumBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) if len(runLumiBinds) > 0: 
self.dbsSetRunLumi.execute( file=runLumiBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Got exception while inserting files into DBSBuffer!\n" msg += str(ex) logging.error(msg) logging.debug("Listing binds:") logging.debug("jobLocation: %s\n" % jobLocation) logging.debug("dbsFiles: %s\n" % dbsFileTuples) logging.debug("dbsFileLoc: %s\n" % dbsFileLoc) logging.debug("Checksum binds: %s\n" % dbsCksumBinds) logging.debug("RunLumi binds: %s\n" % runLumiBinds) raise AccountantWorkerException(msg) # Now that we've created those files, clear the list self.dbsFilesToCreate = [] return def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds): """ _handleWMBSFiles_ Do what can be done in bulk in bulk """ if len(wmbsFilesToBuild) == 0: # Nothing to do return runLumiBinds = [] fileCksumBinds = [] fileLocations = [] fileCreate = [] for wmbsFile in wmbsFilesToBuild: lfn = wmbsFile['lfn'] if lfn == None: continue selfChecksums = wmbsFile['checksums'] # by jobType add to different parentage relation # if it is the merge job, don't include the parentage on failed input files. # otherwise parentage is set for all input files. parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']}) if wmbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']}) if len(wmbsFile.getLocations()) > 0: outpnn = wmbsFile.getLocations()[0] if self.pnn_to_psn.get(outpnn, None): fileLocations.append({'lfn': lfn, 'location': outpnn}) else: msg = "PNN doesn't exist in wmbs_location_sename table: %s (investigate)" % outpnn logging.error(msg) raise AccountantWorkerException(msg) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): fileCksumBinds.append({ 'lfn': lfn, 'cksum': selfChecksums[entry], 'cktype': entry }) fileCreate.append([ lfn, wmbsFile['size'], wmbsFile['events'], None, wmbsFile["first_event"], wmbsFile['merged'] ]) if len(fileCreate) == 0: return try: self.addFileAction.execute(files=fileCreate, conn=self.getDBConn(), transaction=self.existingTransaction()) if runLumiBinds: self.setFileRunLumi.execute( file=runLumiBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.setFileAddChecksum.execute( bulkList=fileCksumBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.setFileLocation.execute( lfn=fileLocations, location=self.fileLocation, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while adding files to WMBS!\n" msg += str(ex) logging.error(msg) logging.debug("Printing binds: \n") logging.debug("FileCreate binds: %s\n" % fileCreate) logging.debug("Runlumi binds: %s\n" % runLumiBinds) logging.debug("Checksum binds: %s\n" % fileCksumBinds) logging.debug("FileLocation binds: %s\n" % fileLocations) raise AccountantWorkerException(msg) # Clear out finished files wmbsFilesToBuild = [] return def createFileFromDataStructsFile(self, file, jobID): """ _createFileFromDataStructsFile_ This function will create a WMBS File given a DataStructs file """ wmbsFile = File() wmbsFile.update(file) if isinstance(file["locations"], set): pnn = list(file["locations"])[0] elif isinstance(file["locations"], list): if len(file['locations']) > 1: logging.error( "Have more then one location for a file in job %i" % (jobID)) logging.error("Choosing location %s" % (file['locations'][0])) pnn = file["locations"][0] else: pnn = file["locations"] 
wmbsFile["locations"] = set() if pnn != None: wmbsFile.setLocation(pnn=pnn, immediateSave=False) wmbsFile['jid'] = jobID return wmbsFile def handleDBSBufferParentage(self): """ _handleDBSBufferParentage_ Handle all the DBSBuffer Parentage in bulk if you can """ outputLFNs = [f['lfn'] for f in self.mergedOutputFiles] bindList = [] for lfn in outputLFNs: newParents = self.findDBSParents(lfn=lfn) for parentLFN in newParents: bindList.append({'child': lfn, 'parent': parentLFN}) # Now all the parents should exist # Commit them to DBSBuffer logging.info("About to commit all DBSBuffer Heritage information") logging.info(len(bindList)) if len(bindList) > 0: try: self.dbsLFNHeritage.execute( binds=bindList, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while trying to handle the DBS LFN heritage\n" msg += str(ex) msg += "BindList: %s" % bindList logging.error(msg) raise AccountantWorkerException(msg) return def handleSkippedFiles(self): """ _handleSkippedFiles_ Handle all the skipped files in bulk, the way it handles the skipped files imposes an important restriction: Skipped files should have been processed by a single job in the task and no job mask exists in it. This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased splitting algorithms. Here ACDC records and created and the file are moved to wmbs_sub_files_failed from completed. """ jobList = self.getFullJobInfo.execute( [{ 'jobid': x } for x in self.jobsWithSkippedFiles.keys()], fileSelection=self.jobsWithSkippedFiles, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dataCollection.failedJobs(jobList, useMask=False) return
class PhEDExInjectorPoller(BaseWorkerThread): """ _PhEDExInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.dbsUrl = config.DBSInterface.globalDBSUrl self.pollCounter = 0 self.subFrequency = None if getattr(config.PhEDExInjector, "subscribeDatasets", False): pollInterval = config.PhEDExInjector.pollInterval subInterval = config.PhEDExInjector.subscribeInterval self.subFrequency = max(1, int(round(subInterval / pollInterval))) logging.info( "SubscribeDataset and deleteBlocks will run every %d polling cycles", self.subFrequency) # subscribe on first cycle self.pollCounter = self.subFrequency - 1 # retrieving the node mappings is fickle and can fail quite often self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") try: nodeMappings = self.phedex.getNodeMap() except: time.sleep(2) try: nodeMappings = self.phedex.getNodeMap() except: time.sleep(4) nodeMappings = self.phedex.getNodeMap() # This will be used to map SE names which are stored in the DBSBuffer to # PhEDEx node names. The first key will be the "kind" which consists # of one of the following: MSS, Disk, Buffer. The next key will be the # SE name. self.seMap = {} self.nodeNames = [] for node in nodeMappings["phedex"]["node"]: if node["kind"] not in self.seMap: self.seMap[node["kind"]] = {} logging.info("Adding mapping %s -> %s", node["se"], node["name"]) self.seMap[node["kind"]][node["se"]] = node["name"] self.nodeNames.append(node["name"]) self.phedexNodes = {'MSS': [], 'Disk': []} for node in nodeMappings["phedex"]["node"]: if node["kind"] in ["MSS", "Disk"]: self.phedexNodes[node["kind"]].append(node["name"]) # initialize the alert framework (if available - config.Alert present) # self.sendAlert will be then be available self.initAlerts(compName="PhEDExInjector") self.blocksToRecover = [] return def setup(self, parameters): """ _setup_ Create DAO Factory and setup some DAO. """ myThread = threading.currentThread() daofactory = DAOFactory(package="WMComponent.PhEDExInjector.Database", logger=self.logger, dbinterface=myThread.dbi) self.getUninjected = daofactory(classname="GetUninjectedFiles") self.getMigrated = daofactory(classname="GetMigratedBlocks") self.findDeletableBlocks = daofactory(classname="GetDeletableBlocks") self.markBlocksDeleted = daofactory(classname="MarkBlocksDeleted") self.getUnsubscribed = daofactory(classname="GetUnsubscribedDatasets") self.markSubscribed = daofactory(classname="MarkDatasetSubscribed") daofactory = DAOFactory(package="WMComponent.DBS3Buffer", logger=self.logger, dbinterface=myThread.dbi) self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus") self.setBlockClosed = daofactory(classname="SetBlockClosed") return def algorithm(self, parameters): """ _algorithm_ Poll the database for uninjected files and attempt to inject them into PhEDEx. """ logging.info("Running PhEDEx injector poller algorithm...") self.pollCounter += 1 if self.blocksToRecover: logging.info("""PhEDExInjector Recovery: previous injection call failed, check if files were injected to PhEDEx anyway""") self.recoverInjectedFiles() self.injectFiles() self.closeBlocks() if self.pollCounter == self.subFrequency: self.pollCounter = 0 self.deleteBlocks() self.subscribeDatasets() return def createInjectionSpec(self, injectionData): """ _createInjectionSpec_ Transform the data structure returned from the database into an XML string for the PhEDEx Data Service. 
The injectionData parameter must be a dictionary keyed by dataset path. Each dataset path will map to a list of blocks, each block being a dict. The block dicts will have three keys: name, is-open and files. The files key will be a list of dicts, each of which have the following keys: lfn, size and checksum. The following is an example object: {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} """ injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl) for datasetPath in injectionData: datasetSpec = injectionSpec.getDataset(datasetPath) for fileBlockName, fileBlock in injectionData[ datasetPath].iteritems(): blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"]) for f in fileBlock["files"]: blockSpec.addFile(f["lfn"], f["checksum"], f["size"]) return injectionSpec.save() def createRecoveryFileFormat(self, unInjectedData): """ _createRecoveryFileFormat_ Transform the data structure returned from database in to the dict format for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. unInjectedData format {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} returns [{"block1": set(["lfn1", "lfn2"])}, {"block2": set(["lfn3", "lfn4"])] """ blocks = [] for datasetPath in unInjectedData: for blockName, fileBlock in unInjectedData[datasetPath].items(): newBlock = {blockName: set()} for fileDict in fileBlock["files"]: newBlock[blockName].add(fileDict["lfn"]) blocks.append(newBlock) return blocks def injectFiles(self): """ _injectFiles_ Inject any uninjected files in PhEDEx. """ logging.info("Starting injectFiles method") uninjectedFiles = self.getUninjected.execute() for siteName in uninjectedFiles.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] elif "Disk" in self.seMap and siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] if location == None: msg = "Could not map SE %s to PhEDEx node." 
% siteName
                logging.error(msg)
                self.sendAlert(7, msg=msg)
                continue

            maxDataset = 20
            maxBlocks = 50
            maxFiles = 5000
            numberDatasets = 0
            numberBlocks = 0
            numberFiles = 0
            injectData = {}
            lfnList = []
            for dataset in uninjectedFiles[siteName]:
                numberDatasets += 1
                injectData[dataset] = uninjectedFiles[siteName][dataset]
                for block in injectData[dataset]:
                    numberBlocks += 1
                    numberFiles += len(injectData[dataset][block]['files'])
                    for fileInfo in injectData[dataset][block]['files']:
                        lfnList.append(fileInfo['lfn'])
                if numberDatasets >= maxDataset or numberBlocks >= maxBlocks or numberFiles >= maxFiles:
                    self.injectFilesPhEDExCall(location, injectData, lfnList)
                    numberDatasets = 0
                    numberBlocks = 0
                    numberFiles = 0
                    injectData = {}
                    lfnList = []
            if injectData:
                self.injectFilesPhEDExCall(location, injectData, lfnList)
        return

    def injectFilesPhEDExCall(self, location, injectData, lfnList):
        """
        _injectFilesPhEDExCall_

        Actual PhEDEx call for file injection.
        """
        xmlData = self.createInjectionSpec(injectData)
        logging.debug("injectFiles XMLData: %s", xmlData)
        try:
            injectRes = self.phedex.injectBlocks(location, xmlData)
        except HTTPException as ex:
            # HTTPException with status 400 assumed to be duplicate injection,
            # trigger later block recovery (investigation needed if not the case)
            if ex.status == 400:
                self.blocksToRecover.extend(self.createRecoveryFileFormat(injectData))
            logging.error("PhEDEx file injection failed with HTTPException: %s %s", ex.status, ex.result)
        except Exception as ex:
            logging.error("PhEDEx file injection failed with Exception: %s", str(ex))
            logging.debug("Traceback: %s", str(traceback.format_exc()))
        else:
            logging.info("Injection result: %s", injectRes)
            if "error" in injectRes:
                msg = "Error injecting data %s: %s" % (injectData, injectRes["error"])
                logging.error(msg)
                self.sendAlert(6, msg=msg)
            else:
                try:
                    self.setStatus.execute(lfnList, 1)
                except Exception:
                    # possible deadlock with DBS3Upload, retry once after 5s
                    logging.warning("Oracle exception during file status update, "
                                    "possible deadlock due to race condition, retry after 5s sleep")
                    time.sleep(5)
                    self.setStatus.execute(lfnList, 1)
        return

    def closeBlocks(self):
        """
        _closeBlocks_

        Close any blocks that have been migrated to global DBS.
        """
        logging.info("Starting closeBlocks method")
        migratedBlocks = self.getMigrated.execute()
        for siteName in migratedBlocks.keys():
            # SE names can be stored in DBSBuffer as that is what is returned in
            # the framework job report. We'll try to map the SE name to a
            # PhEDEx node name here.
            location = None
            if siteName in self.nodeNames:
                location = siteName
            else:
                if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]:
                    location = self.seMap["Buffer"][siteName]
                elif "MSS" in self.seMap and siteName in self.seMap["MSS"]:
                    location = self.seMap["MSS"][siteName]
                elif "Disk" in self.seMap and siteName in self.seMap["Disk"]:
                    location = self.seMap["Disk"][siteName]
            if location is None:
                msg = "Could not map SE %s to PhEDEx node."
% siteName
                logging.error(msg)
                self.sendAlert(6, msg=msg)
                continue

            xmlData = self.createInjectionSpec(migratedBlocks[siteName])
            logging.debug("closeBlocks XMLData: %s", xmlData)
            try:
                injectRes = self.phedex.injectBlocks(location, xmlData)
            except HTTPException as ex:
                logging.error("PhEDEx block close failed with HTTPException: %s %s", ex.status, ex.result)
            except Exception as ex:
                logging.error("PhEDEx block close failed with Exception: %s", str(ex))
                logging.debug("Traceback: %s", str(traceback.format_exc()))
            else:
                logging.info("Block closing result: %s", injectRes)
                if "error" not in injectRes:
                    for datasetName in migratedBlocks[siteName]:
                        for blockName in migratedBlocks[siteName][datasetName]:
                            logging.debug("Closing block %s", blockName)
                            self.setBlockClosed.execute(blockName)
                else:
                    msg = "Error injecting data %s: %s" % (migratedBlocks[siteName], injectRes["error"])
                    logging.error(msg)
                    self.sendAlert(6, msg=msg)
        return

    def recoverInjectedFiles(self):
        """
        Run this method when a PhEDEx inject call has timed out. Since cmsweb
        enforces a 3 minute response timeout, a PhEDEx injection call can
        sometimes time out even though the injection itself succeeded.
        In that case run this recovery mode:
        1. first check whether files whose injection status = 0 are already in PhEDEx.
        2. if those files exist, set their in_phedex status to 1.
        3. set self.blocksToRecover = []

        Run this recovery one block at a time; with too many blocks the call
        to the PhEDEx data service on cmsweb can time out.
        """
        # recover one block at a time
        for block in self.blocksToRecover:
            injectedFiles = self.phedex.getInjectedFiles(block)
            if injectedFiles:
                self.setStatus.execute(injectedFiles, 1)
        self.blocksToRecover = []
        return

    def deleteBlocks(self):
        """
        _deleteBlocks_

        Find deletable blocks, then decide whether to delete based on:

        Is there an active subscription for the dataset or block?
          If yes => set deleted=2
          If no  => next check

        Has the transfer to all destinations finished?
          If yes => request block deletion, approve request, set deleted=1
          If no  => do nothing (check again next cycle)
        """
        logging.info("Starting deleteBlocks method")

        blockDict = self.findDeletableBlocks.execute(transaction=False)
        if not blockDict:
            return

        try:
            subscriptions = self.phedex.getSubscriptionMapping(*blockDict.keys())
        except Exception:
            logging.error("Couldn't get subscription info from PhEDEx, retry next cycle")
            return

        skippableBlocks = []
        deletableEntries = {}
        for blockName in blockDict:
            location = blockDict[blockName]['location']
            # should never be triggered, better safe than sorry
            if location.endswith('_MSS'):
                logging.debug("Location %s for block %s is MSS, skip deletion", location, blockName)
                skippableBlocks.append(blockName)
                continue

            dataset = blockDict[blockName]['dataset']
            sites = blockDict[blockName]['sites']

            if blockName in subscriptions and location in subscriptions[blockName]:
                logging.debug("Block %s subscribed to %s, skip deletion", blockName, location)
                binds = {'DELETED': 2, 'BLOCKNAME': blockName}
                self.markBlocksDeleted.execute(binds)
            else:
                blockInfo = []
                try:
                    blockInfo = self.phedex.getReplicaInfoForBlocks(block=blockName, complete='y')['phedex']['block']
                except Exception:
                    logging.error("Couldn't get block info from PhEDEx, retry next cycle")
                else:
                    for entry in blockInfo:
                        if entry['name'] == blockName:
                            nodes = set([x['node'] for x in entry['replica']])
                            if location not in nodes:
                                logging.debug("Block %s not present on %s, mark as deleted", blockName, location)
                                binds = {'DELETED': 1, 'BLOCKNAME': blockName}
                                self.markBlocksDeleted.execute(binds)
                            elif sites.issubset(nodes):
                                logging.debug("Deleting block %s from %s since it is fully transferred", blockName, location)
                                if location not in deletableEntries:
                                    deletableEntries[location] = {}
                                if dataset not in deletableEntries[location]:
                                    deletableEntries[location][dataset] = set()
                                deletableEntries[location][dataset].add(blockName)

        binds = []
        for blockName in skippableBlocks:
            binds.append({'DELETED': 2, 'BLOCKNAME': blockName})
        if binds:
            self.markBlocksDeleted.execute(binds)

        for location in deletableEntries:
            chunkSize = 100
            numberOfBlocks = 0
            blocksToDelete = {}
            for dataset in deletableEntries[location]:
                blocksToDelete[dataset] = deletableEntries[location][dataset]
                numberOfBlocks += len(blocksToDelete[dataset])
                if numberOfBlocks > chunkSize:
                    self.deleteBlocksPhEDExCalls(location, blocksToDelete)
                    numberOfBlocks = 0
                    blocksToDelete = {}
            self.deleteBlocksPhEDExCalls(location, blocksToDelete)
        return

    def deleteBlocksPhEDExCalls(self, location, blocksToDelete):
        """
        _deleteBlocksPhEDExCalls_

        Actual PhEDEx calls for block deletion.
        """
        deletion = PhEDExDeletion(blocksToDelete.keys(), location,
                                  level='block',
                                  comments="WMAgent blocks auto-delete from %s" % location,
                                  blocks=blocksToDelete)
        xmlData = XMLDrop.makePhEDExXMLForBlocks(self.dbsUrl, deletion.getDatasetsAndBlocks())
        logging.debug("deleteBlocks XMLData: %s", xmlData)
        try:
            response = self.phedex.delete(deletion, xmlData)
            requestId = response['phedex']['request_created'][0]['id']
            # auto-approve deletion request
            self.phedex.updateRequest(requestId, 'approve', location)
        except HTTPException as ex:
            logging.error("PhEDEx block delete/approval failed with HTTPException: %s %s", ex.status, ex.result)
        except Exception as ex:
            logging.error("PhEDEx block delete/approval failed with Exception: %s", str(ex))
            logging.debug("Traceback: %s", str(traceback.format_exc()))
        else:
            binds = []
            for dataset in blocksToDelete:
                for blockName in blocksToDelete[dataset]:
                    binds.append({'DELETED': 1, 'BLOCKNAME': blockName})
self.markBlocksDeleted.execute(binds) return def subscribeDatasets(self): """ _subscribeDatasets_ Poll the database for datasets and subscribe them. """ logging.info("Starting subscribeDatasets method") # Check for completely unsubscribed datasets unsubscribedDatasets = self.getUnsubscribed.execute() # Keep a list of subscriptions to tick as subscribed in the database subscriptionsMade = [] # Create a list of subscriptions as defined by the PhEDEx data structures subs = SubscriptionList() # Create the subscription objects and add them to the list # The list takes care of the sorting internally for subInfo in unsubscribedDatasets: site = subInfo['site'] if site not in self.phedexNodes[ 'MSS'] and site not in self.phedexNodes['Disk']: msg = "Site %s doesn't appear to be valid to PhEDEx, " % site msg += "skipping subscription: %s" % subInfo['id'] logging.error(msg) self.sendAlert(7, msg=msg) continue # Avoid custodial subscriptions to disk nodes if site not in self.phedexNodes['MSS']: subInfo['custodial'] = 'n' # Avoid auto approval in T1 sites elif site.startswith("T1"): subInfo['request_only'] = 'y' phedexSub = PhEDExSubscription( subInfo['path'], site, subInfo['phedex_group'], priority=subInfo['priority'], move=subInfo['move'], custodial=subInfo['custodial'], request_only=subInfo['request_only'], subscriptionId=subInfo['id']) # Check if the subscription is a duplicate if phedexSub.matchesExistingSubscription(self.phedex) or \ phedexSub.matchesExistingTransferRequest(self.phedex): subscriptionsMade.append(subInfo['id']) continue # Add it to the list subs.addSubscription(phedexSub) # Compact the subscriptions subs.compact() for subscription in subs.getSubscriptionList(): xmlData = XMLDrop.makePhEDExXMLForDatasets( self.dbsUrl, subscription.getDatasetPaths()) logging.debug("subscribeDatasets XMLData: %s", xmlData) logging.info( "Subscribing: %s to %s, with options: Move: %s, Custodial: %s, Request Only: %s", subscription.getDatasetPaths(), subscription.getNodes(), subscription.move, subscription.custodial, subscription.request_only) try: self.phedex.subscribe(subscription, xmlData) except HTTPException as ex: logging.error( "PhEDEx dataset subscribe failed with HTTPException: %s %s", ex.status, ex.result) except Exception as ex: logging.error( "PhEDEx dataset subscribe failed with Exception: %s", str(ex)) logging.debug("Traceback: %s", str(traceback.format_exc())) else: subscriptionsMade.extend(subscription.getSubscriptionIds()) # Register the result in DBSBuffer if subscriptionsMade: self.markSubscribed.execute(subscriptionsMade) return
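# Worked example of the injectionData structure that createInjectionSpec()
# documents above; the dataset, block and LFN names are made up for
# illustration only, and `poller` stands for any PhEDExInjectorPoller instance.
exampleInjectionData = {
    "/BogusPrimary/Run2012Z-PromptReco-v1/RECO": {
        "/BogusPrimary/Run2012Z-PromptReco-v1/RECO#block1": {
            "is-open": "y",
            "files": [{"lfn": "/store/data/lfn1.root", "size": 10,
                       "checksum": {"cksum": "1234"}},
                      {"lfn": "/store/data/lfn2.root", "size": 20,
                       "checksum": {"cksum": "4321"}}],
        }
    }
}
# poller.createInjectionSpec(exampleInjectionData) would serialise this into
# the XML payload handed to self.phedex.injectBlocks(location, xmlData).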
class TransferDaemon(BaseDaemon):
    """
    _TransferDaemon_

    Call multiprocessing library to instantiate a TransferWorker for each user.
    """
    def __init__(self, config):
        """
        Initialise class members:
        1. check and create dropbox dir
        2. define oracle and couch (config and file instance) server connections
        3. PhEDEx connection
        4. Setup wmcore factory
        """
        self.doc_acq = ''
        # Need a better way to test this without turning off this next line
        BaseDaemon.__init__(self, config, 'AsyncTransfer')

        self.dropbox_dir = '%s/dropbox/outputs' % self.config.componentDir
        if not os.path.isdir(self.dropbox_dir):
            try:
                os.makedirs(self.dropbox_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    self.logger.exception('Unknown error in mkdir: %s' % e.errno)
                    raise
        if not os.path.isdir("/tmp/DashboardReport"):
            try:
                os.makedirs("/tmp/DashboardReport")
            except OSError as e:
                if e.errno != errno.EEXIST:
                    self.logger.exception('Unknown error in mkdir: %s' % e.errno)
                    raise
        try:
            config_server = CouchServer(dburl=self.config.config_couch_instance)
            self.config_db = config_server.connectDatabase(self.config.config_database)
        except Exception:
            self.logger.exception('Failed when contacting local couch')
            raise
        try:
            self.oracleDB = HTTPRequests(self.config.oracleDB,
                                         self.config.opsProxy,
                                         self.config.opsProxy)
        except Exception:
            self.logger.exception('Failed when contacting Oracle')
            raise
        self.pool = Pool(processes=self.config.pool_size)
        self.factory = WMFactory(self.config.schedAlgoDir, namespace=self.config.schedAlgoDir)
        self.site_tfc_map = {}
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key': self.config.opsProxy, 'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
            raise
        # TODO: decode xml
        try:
            self.phedex2 = PhEDEx(responseType='json',
                                  dict={'key': self.config.opsProxy, 'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
            raise
        self.logger.debug(type((self.phedex2.getNodeMap())['phedex']['node']))
        for site in [x['name'] for x in self.phedex2.getNodeMap()['phedex']['node']]:
            if site and str(site) != 'None' and str(site) != 'unknown':
                self.site_tfc_map[site] = self.get_tfc_rules(site)
                self.logger.debug('tfc site: %s %s' % (site, self.get_tfc_rules(site)))

    # Overriding setup() is optional, and not needed here

    def algorithm(self, parameters=None):
        """
        1. Get transfer config from couchdb config instance
        2. Get a list of users with files to transfer from the db instance
           (oracle or couch, by config flag)
        3. For each user get a suitably sized input for submission (call to a list)
        4. Submit to a subprocess
        """
        if self.config.isOracle:
            users = self.oracleSiteUser(self.oracleDB)
        else:
            users = self.active_users(self.db)
            sites = self.active_sites()
            self.logger.info('%s active sites' % len(sites))
            self.logger.debug('Active sites are: %s' % sites)
        self.logger.debug('kicking off pool')
        for u in users:
            for i in range(len(u)):
                if not u[i]:
                    u[i] = ''
            self.logger.debug('current_running %s' % current_running)
            self.logger.debug('Testing current running: %s %s %s' % (u, current_running, (u not in current_running)))
            if u not in current_running:
                self.logger.debug('processing %s' % u)
                current_running.append(u)
                self.logger.debug('processing %s' % current_running)
                self.pool.apply_async(ftscp, (u, self.site_tfc_map, self.config), callback=log_result)

    def oracleSiteUser(self, db):
        """
        1. Acquire transfers from DB
        2.
Get acquired users and destination sites """ self.logger.info('Retrieving users...') fileDoc = dict() fileDoc['subresource'] = 'activeUsers' fileDoc['grouping'] = 0 fileDoc['asoworker'] = self.config.asoworker result = dict() try: result = db.get(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) except Exception as ex: self.logger.error("Failed to acquire transfers \ from oracleDB: %s" % ex) return [] self.logger.debug(oracleOutputMapping(result)) # TODO: translate result into list((user,group,role),...) if len(oracleOutputMapping(result)) != 0: self.logger.debug(type( [[x['username'].encode('ascii','ignore'), x['user_group'], x['user_role']] for x in oracleOutputMapping(result)])) try: docs = oracleOutputMapping(result) users = [[x['username'], x['user_group'], x['user_role']] for x in docs] self.logger.info('Users to process: %s' % str(users)) except: self.logger.exception('User data malformed. ') else: self.logger.info('No new user to acquire') return [] actives = list() for user in users: fileDoc = dict() fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'acquireTransfers' fileDoc['username'] = user[0] self.logger.debug("Retrieving transfers from oracleDB for user: %s " % user[0]) try: result = db.post(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) except Exception as ex: self.logger.error("Failed to acquire transfers \ from oracleDB: %s" %ex) continue self.doc_acq = str(result) for i in range(len(user)): if not user[i] or user[i] in ['None', 'NULL']: user[i] = '' user[i] = str(user[i]) actives.append(user) self.logger.debug("Transfers retrieved from oracleDB. %s " % users) return users def active_users(self, db): """ Query a view for users with files to transfer. get this from the following view: ftscp?group=true&group_level=1 """ query = {'group': True, 'group_level': 3} try: users = db.loadView(self.config.ftscp_design, 'ftscp_all', query) except Exception as e: self.logger.exception('A problem occured when\ contacting couchDB: %s' % e) return [] if len(users['rows']) <= self.config.pool_size: active_users = [x['key'] for x in users['rows']] else: sorted_users = self.factory.loadObject(self.config.algoName, args=[self.config, self.logger, users['rows'], self.config.pool_size], getFromCache=False, listFlag=True) active_users = sorted_users()[:self.config.pool_size] self.logger.info('%s active users' % len(active_users)) self.logger.debug('Active users are: %s' % active_users) return active_users def active_sites(self): """ Get a list of all sites involved in transfers. """ query = {'group': True, 'stale': 'ok'} try: sites = self.db.loadView('AsyncTransfer', 'sites', query) except Exception as e: self.logger.exception('A problem occured \ when contacting couchDB: %s' % e) return [] def keys_map(inputDict): """ Map function. """ return inputDict['key'] return map(keys_map, sites['rows']) def get_tfc_rules(self, site): """ Get the TFC regexp for a given site. """ tfc_file = None try: self.phedex.getNodeTFC(site) except Exception as e: self.logger.exception('PhEDEx exception: %s' % e) try: tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site}) except Exception as e: self.logger.exception('PhEDEx cache exception: %s' % e) return readTFC(tfc_file) def terminate(self, parameters=None): """ Called when thread is being terminated. """ self.pool.close() self.pool.join()
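# oracleSiteUser() normalises each acquired (username, group, role) triple so
# that None/'None'/'NULL' fields become empty strings before the triples are
# compared against current_running and handed to the pool. An illustrative,
# stand-alone sketch of that normalisation (the helper name is hypothetical,
# not part of the AsyncStageOut code):
def normaliseUserTriple(user):
    """Return a copy of [username, group, role] with null-ish fields as ''."""
    cleaned = []
    for field in user:
        if not field or field in ('None', 'NULL'):
            cleaned.append('')
        else:
            cleaned.append(str(field))
    return cleaned

# normaliseUserTriple(['alice', None, 'NULL']) -> ['alice', '', '']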
class PhEDExInjectorSubscriber(BaseWorkerThread):
    """
    _PhEDExInjectorSubscriber_

    Poll the DBSBuffer database and subscribe datasets as they are created.
    """
    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json")
        self.siteDB = SiteDBJSON()
        self.dbsUrl = config.DBSInterface.globalDBSUrl
        self.group = getattr(config.PhEDExInjector, "group", "DataOps")
        self.safeMode = getattr(config.PhEDExInjector, "safeOperationMode", False)

        # Subscribed state in the DBSBuffer table for datasets
        self.terminalSubscriptionState = 1
        if self.safeMode:
            self.terminalSubscriptionState = 2

        # We will map node names to CMS names, as that is what the spec will have.
        # If a CMS name is associated with many PhEDEx nodes, choose the MSS option
        self.cmsToPhedexMap = {}

        # initialize the alert framework (if available - config.Alert present)
        # self.sendAlert will then be available
        self.initAlerts(compName = "PhEDExInjector")

    def setup(self, parameters):
        """
        _setup_

        Create a DAO Factory for the PhEDExInjector. Also load the SE names to
        PhEDEx node name mappings from the data service.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database",
                                logger = self.logger,
                                dbinterface = myThread.dbi)
        self.getUnsubscribed = daofactory(classname = "GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname = "MarkDatasetSubscribed")
        self.getPartiallySubscribed = daofactory(classname = "GetPartiallySubscribedDatasets")

        nodeMappings = self.phedex.getNodeMap()
        for node in nodeMappings["phedex"]["node"]:
            cmsName = self.siteDB.phEDExNodetocmsName(node["name"])
            if cmsName not in self.cmsToPhedexMap:
                self.cmsToPhedexMap[cmsName] = {}
            logging.info("Loaded PhEDEx node %s for site %s" % (node["name"], cmsName))
            if node["kind"] not in self.cmsToPhedexMap[cmsName]:
                self.cmsToPhedexMap[cmsName][node["kind"]] = node["name"]
        return

    def algorithm(self, parameters):
        """
        _algorithm_

        Poll the database for datasets and subscribe them.
""" myThread = threading.currentThread() myThread.transaction.begin() # Check for completely unsubscribed datasets unsubscribedDatasets = self.getUnsubscribed.execute(conn = myThread.transaction.conn, transaction = True) if self.safeMode: partiallySubscribedDatasets = self.getPartiallySubscribed.execute(conn = myThread.transaction.conn, transaction = True) unsubscribedDatasets.extend(partiallySubscribedDatasets) partiallySubscribedSet = set() for entry in partiallySubscribedDatasets: partiallySubscribedSet.add(entry["path"]) # Map the datasets to their specs specDatasetMap = {} for unsubscribedDataset in unsubscribedDatasets: datasetPath = unsubscribedDataset["path"] workflow = unsubscribedDataset["workflow"] spec = unsubscribedDataset["spec"] if datasetPath not in specDatasetMap: specDatasetMap[datasetPath] = [] specDatasetMap[datasetPath].append({"workflow" : workflow, "spec" : spec}) specCache = {} siteMap = {} # Distribute the subscriptions by site, type and priority # This is to make as few subscriptions as possible # Site map values are dictionaries where the keys are tuples (Prio, Custodial, AutoApprove, Move) # Where Custodial is boolean, Prio is in ["Low", "Normal", "High"], AutoApprove is boolean and Move is boolean for dataset in specDatasetMap: # Aggregate all the different subscription configurations subInfo = {} for entry in specDatasetMap[dataset]: if not entry["spec"]: # Can't use this spec, there isn't one continue # Load spec if not in the cache if entry["spec"] not in specCache: helper = WMWorkloadHelper() try: helper.load(entry["spec"]) specCache[entry["spec"]] = helper except Exception: #Couldn't load it , alert and carry on msg = "Couldn't load spec: %s" % entry["spec"] logging.error(msg) self.sendAlert(7, msg = msg) continue #If we are running in safe mode, we need to know if the workflow is ready # We have the spec, get the info helper = specCache[entry["spec"]] workflowSubInfo = helper.getSubscriptionInformation() datasetSubInfo = workflowSubInfo.get(dataset, None) if datasetSubInfo and subInfo: subInfo["CustodialSites"] = extendWithoutDups(subInfo["CustodialSites"], datasetSubInfo["CustodialSites"]) subInfo["NonCustodialSites"] = extendWithoutDups(subInfo["NonCustodialSites"], datasetSubInfo["NonCustodialSites"]) subInfo["AutoApproveSites"] = extendWithoutDups(subInfo["AutoApproveSites"], datasetSubInfo["AutoApproveSites"]) subInfo["Priority"] = solvePrioConflicts(subInfo["Priority"], datasetSubInfo["Priority"]) elif datasetSubInfo: subInfo = datasetSubInfo # We now have aggregated subscription information for this dataset in subInfo # Distribute it by site if not subInfo: #Nothing to do, log and continue msg = "No subscriptions configured for dataset %s" % dataset logging.warning(msg) self.markSubscribed.execute(dataset, subscribed = self.terminalSubscriptionState, conn = myThread.transaction.conn, transaction = True) continue # Make sure that a site is not configured both as non custodial and custodial # Non-custodial is believed to be the right choice subInfo["CustodialSites"] = list(set(subInfo["CustodialSites"]) - set(subInfo["NonCustodialSites"])) for site in subInfo["CustodialSites"]: if site not in siteMap: siteMap[site] = {} if self.safeMode and dataset not in partiallySubscribedSet: tupleKey = (subInfo["Priority"], True, False, False) else: tupleKey = (subInfo["Priority"], True, False, True) if tupleKey not in siteMap[site]: siteMap[site][tupleKey] = [] siteMap[site][tupleKey].append(dataset) # If we are in safe mode and this is a partially subscribed 
dataset, # then the non-custodial were done in a previous cycle if self.safeMode and dataset in partiallySubscribedSet: self.markSubscribed.execute(dataset, subscribed = self.terminalSubscriptionState, conn = myThread.transaction.conn, transaction = True) continue for site in subInfo["NonCustodialSites"]: if site not in siteMap: siteMap[site] = {} autoApprove = False if site in subInfo["AutoApproveSites"]: autoApprove = True tupleKey = (subInfo["Priority"], False, autoApprove) if tupleKey not in siteMap[site]: siteMap[site][tupleKey] = [] siteMap[site][tupleKey].append(dataset) self.markSubscribed.execute(dataset, subscribed = 1, conn = myThread.transaction.conn, transaction = True) # Actually request the subscriptions for site in siteMap: # Check that the site is valid if site not in self.cmsToPhedexMap: msg = "Site %s doesn't appear to be valid to PhEDEx" % site logging.error(msg) self.sendAlert(7, msg = msg) continue for subscriptionFlavor in siteMap[site]: datasets = siteMap[site][subscriptionFlavor] # Check that the site is valid if "MSS" in self.cmsToPhedexMap[site]: phedexNode = self.cmsToPhedexMap[site]["MSS"] else: phedexNode = self.cmsToPhedexMap[site]["Disk"] logging.info("Subscribing %s to %s" % (datasets, site)) options = {"custodial" : "n", "requestOnly" : "y", "priority" : subscriptionFlavor[0].lower(), "move" : "n"} if subscriptionFlavor[1]: options["custodial"] = "y" if subscriptionFlavor[3]: options["move"] = "y" if subscriptionFlavor[2]: options["requestOnly"] = "n" newSubscription = PhEDExSubscription(datasets, phedexNode, self.group, **options) xmlData = XMLDrop.makePhEDExXMLForDatasets(self.dbsUrl, newSubscription.getDatasetPaths()) logging.debug(str(xmlData)) self.phedex.subscribe(newSubscription, xmlData) myThread.transaction.commit() return
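
# The helpers extendWithoutDups() and solvePrioConflicts() are called by the
# aggregation loop above but are not defined in this file.  Below is a minimal
# sketch of plausible implementations, assuming priorities are ordered
# "Low" < "Normal" < "High" as the siteMap comment states; the real WMCore
# helpers may differ in detail.

def extendWithoutDups(listA, listB):
    """Merge two site lists, dropping duplicates while preserving order."""
    merged = list(listA)
    for item in listB:
        if item not in merged:
            merged.append(item)
    return merged

def solvePrioConflicts(prioA, prioB):
    """Pick the higher of two subscription priorities."""
    order = {"Low": 0, "Normal": 1, "High": 2}
    return prioA if order[prioA] >= order[prioB] else prioB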
class PhEDExInjectorSubscriber(BaseWorkerThread):
    """
    _PhEDExInjectorSubscriber_

    Poll the DBSBuffer database and subscribe datasets as they are created.
    """

    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json")
        self.siteDB = SiteDBJSON()
        self.dbsUrl = config.DBSInterface.globalDBSUrl
        self.group = getattr(config.PhEDExInjector, "group", "DataOps")

        # We will map node names to CMS names, which is what the spec will have.
        # If a CMS name is associated to many PhEDEx nodes then choose the MSS option.
        self.cmsToPhedexMap = {}
        self.phedexNodes = {"MSS": [], "Disk": []}

        # initialize the alert framework (if available - config.Alert present)
        # self.sendAlert will then be available
        self.initAlerts(compName="PhEDExInjector")

    def setup(self, parameters):
        """
        _setup_

        Create a DAO Factory for the PhEDExInjector.  Also load the PhEDEx node
        name to CMS name mappings from the data service.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package="WMComponent.PhEDExInjector.Database",
                                logger=self.logger,
                                dbinterface=myThread.dbi)

        self.getUnsubscribed = daofactory(classname="GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname="MarkDatasetSubscribed")

        nodeMappings = self.phedex.getNodeMap()
        for node in nodeMappings["phedex"]["node"]:
            cmsName = self.siteDB.phEDExNodetocmsName(node["name"])
            if cmsName not in self.cmsToPhedexMap:
                self.cmsToPhedexMap[cmsName] = {}
            logging.info("Loaded PhEDEx node %s for site %s" % (node["name"], cmsName))
            if node["kind"] not in self.cmsToPhedexMap[cmsName]:
                self.cmsToPhedexMap[cmsName][node["kind"]] = node["name"]
            if node["kind"] in ["MSS", "Disk"]:
                self.phedexNodes[node["kind"]].append(node["name"])
        return

    def algorithm(self, parameters):
        """
        _algorithm_

        Run the subscription algorithm as configured
        """
        self.subscribeDatasets()
        return

    def subscribeDatasets(self):
        """
        _subscribeDatasets_

        Poll the database for datasets and subscribe them.
        """
        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Check for completely unsubscribed datasets
        unsubscribedDatasets = self.getUnsubscribed.execute(conn=myThread.transaction.conn,
                                                            transaction=True)

        # Keep a list of subscriptions to tick as subscribed in the database
        subscriptionsMade = []

        # Create a list of subscriptions as defined by the PhEDEx data structures
        subs = SubscriptionList()

        # Create the subscription objects and add them to the list.
        # The list takes care of the sorting internally.
        for subInfo in unsubscribedDatasets:
            site = subInfo["site"]

            if site not in self.phedexNodes["MSS"] and site not in self.phedexNodes["Disk"]:
                if site not in self.cmsToPhedexMap:
                    msg = "Site %s doesn't appear to be valid to PhEDEx, " % site
                    msg += "skipping subscription: %s" % subInfo["id"]
                    logging.error(msg)
                    self.sendAlert(7, msg=msg)
                    continue
                # Get the PhEDEx node from the CMS site name
                site = self.cmsToPhedexMap[site].get("MSS") or self.cmsToPhedexMap[site]["Disk"]

            # Avoid custodial subscriptions to disk nodes
            if site not in self.phedexNodes["MSS"]:
                subInfo["custodial"] = "n"
            # Avoid auto approval in T1 sites
            elif site.startswith("T1"):
                subInfo["request_only"] = "y"

            phedexSub = PhEDExSubscription(subInfo["path"], site, self.group,
                                           priority=subInfo["priority"],
                                           move=subInfo["move"],
                                           custodial=subInfo["custodial"],
                                           request_only=subInfo["request_only"],
                                           subscriptionId=subInfo["id"])

            # Check if the subscription is a duplicate
            if phedexSub.matchesExistingSubscription(self.phedex) or \
               phedexSub.matchesExistingTransferRequest(self.phedex):
                subscriptionsMade.append(subInfo["id"])
                continue

            # Add it to the list
            subs.addSubscription(phedexSub)

        # Compact the subscriptions
        subs.compact()

        for subscription in subs.getSubscriptionList():
            try:
                xmlData = XMLDrop.makePhEDExXMLForDatasets(self.dbsUrl,
                                                           subscription.getDatasetPaths())
                logging.debug(str(xmlData))
                msg = "Subscribing: %s to %s, with options: " % (subscription.getDatasetPaths(),
                                                                 subscription.getNodes())
                msg += "Move: %s, Custodial: %s, Request Only: %s" % (subscription.move,
                                                                      subscription.custodial,
                                                                      subscription.request_only)
                logging.info(msg)
                self.phedex.subscribe(subscription, xmlData)
            except Exception as ex:
                logging.error("Something went wrong when communicating with PhEDEx, will try again later.")
                logging.error("Exception: %s" % str(ex))
            else:
                subscriptionsMade.extend(subscription.getSubscriptionIds())

        # Register the result in DBSBuffer
        if subscriptionsMade:
            self.markSubscribed.execute(subscriptionsMade,
                                        conn=myThread.transaction.conn,
                                        transaction=True)

        myThread.transaction.commit()
        return
class PhEDExInjectorPoller(BaseWorkerThread):
    """
    _PhEDExInjectorPoller_

    Poll the DBSBuffer database and inject files as they are created.
    """

    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        self.enabled = getattr(config.PhEDExInjector, "enabled", True)
        self.dbsUrl = config.DBSInterface.globalDBSUrl
        self.phedexGroup = config.PhEDExInjector.phedexGroup

        self.pollCounter = 0
        self.subFrequency = None
        if getattr(config.PhEDExInjector, "subscribeDatasets", False):
            pollInterval = config.PhEDExInjector.pollInterval
            subInterval = config.PhEDExInjector.subscribeInterval
            self.subFrequency = max(1, int(round(subInterval / pollInterval)))
            logging.info("SubscribeDataset and deleteBlocks will run every %d polling cycles",
                         self.subFrequency)
            # subscribe on first cycle
            self.pollCounter = self.subFrequency - 1

        # retrieving the node mappings is fickle and can fail quite often
        self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl},
                             "json", dbsUrl=self.dbsUrl)
        try:
            nodeMappings = self.phedex.getNodeMap()
        except Exception:
            time.sleep(2)
            try:
                nodeMappings = self.phedex.getNodeMap()
            except Exception:
                time.sleep(4)
                nodeMappings = self.phedex.getNodeMap()

        # This will be used to map SE names which are stored in the DBSBuffer to
        # PhEDEx node names.  The first key will be the "kind" which consists
        # of one of the following: MSS, Disk, Buffer.  The next key will be the
        # SE name.
        self.seMap = {}
        self.nodeNames = []
        for node in nodeMappings["phedex"]["node"]:
            if node["kind"] not in self.seMap:
                self.seMap[node["kind"]] = {}
            logging.info("Adding mapping %s -> %s", node["se"], node["name"])
            self.seMap[node["kind"]][node["se"]] = node["name"]
            self.nodeNames.append(node["name"])

        self.phedexNodes = {'MSS': [], 'Disk': []}
        for node in nodeMappings["phedex"]["node"]:
            if node["kind"] in ["MSS", "Disk"]:
                self.phedexNodes[node["kind"]].append(node["name"])

        self.blocksToRecover = []

        # X-component configuration is BAD! But it will only be here during the
        # Rucio commissioning within WM
        self.listTiersToSkip = config.RucioInjector.listTiersToInject
        logging.info("Component configured to skip data injection for data tiers: %s",
                     self.listTiersToSkip)

        return

    def setup(self, parameters):
        """
        _setup_

        Create the DAO Factory and set up the DAOs.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package="WMComponent.RucioInjector.Database",
                                logger=self.logger,
                                dbinterface=myThread.dbi)

        self.getUninjected = daofactory(classname="GetUninjectedFiles")
        self.getMigrated = daofactory(classname="GetMigratedBlocks")

        self.getUnsubscribedBlocks = daofactory(classname="GetUnsubscribedBlocks")
        self.setBlockRules = daofactory(classname="SetBlocksRule")

        self.findDeletableBlocks = daofactory(classname="GetDeletableBlocks")
        self.markBlocksDeleted = daofactory(classname="MarkBlocksDeleted")

        self.getUnsubscribed = daofactory(classname="GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname="MarkDatasetSubscribed")

        daofactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                logger=self.logger,
                                dbinterface=myThread.dbi)
        self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus")
        self.setBlockClosed = daofactory(classname="SetBlockClosed")

        return

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Poll the database for uninjected files and attempt to inject them into
        PhEDEx.
        """
        if not self.enabled:
            logging.info("PhEDExInjector component is disabled in the configuration, exiting.")
            return

        logging.info("Running PhEDEx injector poller algorithm...")
        self.pollCounter += 1

        try:
            if self.blocksToRecover:
                logging.info("PhEDExInjector Recovery: previous injection call failed, "
                             "checking if files were injected to PhEDEx anyway")
                self.recoverInjectedFiles()

            self.injectFiles()
            self.closeBlocks()

            if self.pollCounter == self.subFrequency:
                self.pollCounter = 0
                self.deleteBlocks()
                self.subscribeDatasets()
                self.subscribeBlocks()
        except HTTPException as ex:
            if hasattr(ex, "status") and ex.status in [502, 503]:
                # then either a proxy error or the service is unavailable
                msg = "Caught HTTPException in PhEDExInjector. Retrying in the next cycle.\n"
                msg += str(ex)
                logging.error(msg)
            else:
                msg = "Caught unexpected HTTPException in PhEDExInjector.\n%s" % str(ex)
                logging.exception(msg)
                raise
        except Exception as ex:
            msg = "Caught unexpected exception in PhEDExInjector. Details:\n%s" % str(ex)
            logging.exception(msg)
            raise PhEDExInjectorException(msg)

        return

    def createInjectionSpec(self, injectionData):
        """
        _createInjectionSpec_

        Transform the data structure returned from the database into an XML
        string for the PhEDEx Data Service.  The injectionData parameter must
        be a dictionary keyed by dataset path.  Each dataset path will map to a
        list of blocks, each block being a dict.  The block dicts will have
        three keys: name, is-open and files.  The files key will be a list of
        dicts, each of which have the following keys: lfn, size and checksum.
        The following is an example object:

        {"dataset1":
          {"block1": {"is-open": "y", "files":
            [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}},
             {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}}
        """
        injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl)

        for datasetPath in injectionData:
            datasetSpec = injectionSpec.getDataset(datasetPath)
            for fileBlockName, fileBlock in injectionData[datasetPath].iteritems():
                blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"])
                for f in fileBlock["files"]:
                    blockSpec.addFile(f["lfn"], f["checksum"], f["size"])

        return injectionSpec.save()

    def createRecoveryFileFormat(self, unInjectedData):
        """
        _createRecoveryFileFormat_

        Transform the data structure returned from the database into the dict
        format used by the PhEDEx Data Service.  The unInjectedData parameter
        must be a dictionary keyed by dataset path:

        {"dataset1":
          {"block1": {"is-open": "y", "files":
            [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}},
             {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}}

        returns

        [{"block1": set(["lfn1", "lfn2"])},
         {"block2": set(["lfn3", "lfn4"])}]
        """
        blocks = []
        for datasetPath in unInjectedData:
            for blockName, fileBlock in unInjectedData[datasetPath].items():
                newBlock = {blockName: set()}
                for fileDict in fileBlock["files"]:
                    newBlock[blockName].add(fileDict["lfn"])
                blocks.append(newBlock)
        return blocks

    def injectFiles(self):
        """
        _injectFiles_

        Inject any uninjected files into PhEDEx.
        """
        logging.info("Starting injectFiles method")

        uninjectedFiles = self.getUninjected.execute()

        # filter out datatiers to be processed by RucioInjector
        uninjectedFiles = filterDataByTier(uninjectedFiles, self.listTiersToSkip)

        for siteName in uninjectedFiles.keys():
            # SE names can be stored in DBSBuffer as that is what is returned in
            # the framework job report.  We'll try to map the SE name to a
            # PhEDEx node name here.
            location = None
            if siteName in self.nodeNames:
                location = siteName
            else:
                if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]:
                    location = self.seMap["Buffer"][siteName]
                elif "MSS" in self.seMap and siteName in self.seMap["MSS"]:
                    location = self.seMap["MSS"][siteName]
                elif "Disk" in self.seMap and siteName in self.seMap["Disk"]:
                    location = self.seMap["Disk"][siteName]

            if location is None:
                msg = "Could not map SE %s to PhEDEx node." % siteName
                logging.error(msg)
                continue

            for dataset in uninjectedFiles[siteName]:
                injectData = {}
                lfnList = []
                injectData[dataset] = uninjectedFiles[siteName][dataset]
                for block in injectData[dataset]:
                    for fileInfo in injectData[dataset][block]['files']:
                        lfnList.append(fileInfo['lfn'])
                    logging.info("About to inject %d files for block %s",
                                 len(injectData[dataset][block]['files']), block)
                self.injectFilesPhEDExCall(location, injectData, lfnList)
        return

    def injectFilesPhEDExCall(self, location, injectData, lfnList):
        """
        _injectFilesPhEDExCall_

        The actual PhEDEx call for file injection.
        """
        xmlData = self.createInjectionSpec(injectData)
        logging.debug("injectFiles XMLData: %s", xmlData)

        try:
            injectRes = self.phedex.injectBlocks(location, xmlData)
        except HTTPException as ex:
            # An HTTPException with status 400 is assumed to be a duplicate injection;
            # trigger the block recovery later on (investigation needed if that is not the case)
            if ex.status == 400:
                self.blocksToRecover.extend(self.createRecoveryFileFormat(injectData))
            logging.error("PhEDEx file injection failed with HTTPException: %s %s",
                          ex.status, ex.result)
        except Exception as ex:
            msg = "PhEDEx file injection failed with Exception: %s" % str(ex)
            logging.exception(msg)
        else:
            logging.debug("Injection result: %s", injectRes)

            if "error" in injectRes:
                msg = "Error injecting data %s: %s" % (injectData, injectRes["error"])
                logging.error(msg)
            else:
                try:
                    self.setStatus.execute(lfnList, 1)
                except Exception as ex:
                    if 'Deadlock found' in str(ex) or 'deadlock detected' in str(ex):
                        logging.error("Database deadlock during file status update. "
                                      "Retrying again in the next cycle.")
                        self.blocksToRecover.extend(self.createRecoveryFileFormat(injectData))
                    else:
                        msg = "Failed to update file status in the database, reason: %s" % str(ex)
                        logging.error(msg)
                        raise PhEDExInjectorException(msg)
        return

    def closeBlocks(self):
        """
        _closeBlocks_

        Close any blocks that have been migrated to global DBS.
        """
        logging.info("Starting closeBlocks method")

        migratedBlocks = self.getMigrated.execute()

        # filter out datatiers to be processed by RucioInjector
        migratedBlocks = filterDataByTier(migratedBlocks, self.listTiersToSkip)

        for siteName in migratedBlocks:
            # SE names can be stored in DBSBuffer as that is what is returned in
            # the framework job report.  We'll try to map the SE name to a
            # PhEDEx node name here.
            location = None
            if siteName in self.nodeNames:
                location = siteName
            else:
                if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]:
                    location = self.seMap["Buffer"][siteName]
                elif "MSS" in self.seMap and siteName in self.seMap["MSS"]:
                    location = self.seMap["MSS"][siteName]
                elif "Disk" in self.seMap and siteName in self.seMap["Disk"]:
                    location = self.seMap["Disk"][siteName]

            if location is None:
                msg = "Could not map SE %s to PhEDEx node." % siteName
                logging.error(msg)
                continue

            for dset, blocks in migratedBlocks[siteName].items():
                xmlData = self.createInjectionSpec({dset: blocks})
                logging.debug("closeBlocks XMLData: %s", xmlData)

                try:
                    injectRes = self.phedex.injectBlocks(location, xmlData)
                except HTTPException as ex:
                    logging.error("PhEDEx block close failed with HTTPException: %s %s",
                                  ex.status, ex.result)
                except Exception as ex:
                    msg = "PhEDEx block close failed with Exception: %s" % str(ex)
                    logging.exception(msg)
                else:
                    logging.debug("Block closing result: %s", injectRes)

                    if "error" in injectRes:
                        logging.error("Failed to close blocks due to: %s, for data: %s",
                                      injectRes["error"], migratedBlocks[siteName][dset])
                    else:
                        for blockName in blocks:
                            logging.info("Block closed in PhEDEx: %s", blockName)
                            self.setBlockClosed.execute(blockName)
        return

    def recoverInjectedFiles(self):
        """
        Run this function when a PhEDEx injection call has timed out.  Since
        cmsweb has a 3 minute response timeout, the PhEDEx injection call can
        sometimes time out even though the call succeeded.  In that case run
        the recovery mode:
        1. first check whether the files with injection status = 0 are in PhEDEx
        2. if those files exist, set their in_phedex status to 1
        3. set self.blocksToRecover = []

        Run this recovery one block at a time; with too many blocks the call to
        the PhEDEx data service on cmsweb can time out.
        """
        # recover one block at a time
        for block in self.blocksToRecover:
            injectedFiles = self.phedex.getInjectedFiles(block)
            if injectedFiles:
                self.setStatus.execute(injectedFiles, 1)

        self.blocksToRecover = []
        return

    def deleteBlocks(self):
        """
        _deleteBlocks_

        Find deletable blocks, then decide whether to delete based on:

        Is there an active subscription for the dataset or block?
          If yes => set deleted = 2
          If no  => next check

        Has the transfer to all destinations finished?
          If yes => request block deletion, approve the request, set deleted = 1
          If no  => do nothing (check again next cycle)
        """
        logging.info("Starting deleteBlocks method")

        blockDict = self.findDeletableBlocks.execute(transaction=False)

        if not blockDict:
            return

        ### logic to stop doing things to be done by RucioInjector or by the DM team
        for block in list(blockDict):
            if not self._isDataTierAllowed(block):
                blockDict.pop(block)

        try:
            subscriptions = self.phedex.getSubscriptionMapping(*blockDict.keys())
        except Exception:
            logging.error("Couldn't get subscription info from PhEDEx, retry next cycle")
            return

        skippableBlocks = []
        deletableEntries = {}
        for blockName in blockDict:
            location = blockDict[blockName]['location']

            # should never be triggered, better safe than sorry
            if location.endswith('_MSS'):
                logging.debug("Location %s for block %s is MSS, skip deletion",
                              location, blockName)
                skippableBlocks.append(blockName)
                continue

            dataset = blockDict[blockName]['dataset']
            sites = blockDict[blockName]['sites']

            if blockName in subscriptions and location in subscriptions[blockName]:
                logging.debug("Block %s subscribed to %s, skip deletion",
                              blockName, location)
                binds = {'DELETED': 2, 'BLOCKNAME': blockName}
                self.markBlocksDeleted.execute(binds)
            else:
                blockInfo = []
                try:
                    blockInfo = self.phedex.getReplicaInfoForBlocks(block=blockName,
                                                                    complete='y')['phedex']['block']
                except Exception:
                    logging.error("Couldn't get block info from PhEDEx, retry next cycle")
                else:
                    nodes = set()
                    for entry in blockInfo:
                        if entry['name'] == blockName:
                            nodes = set([x['node'] for x in entry['replica']])
                    if location not in nodes:
                        logging.debug("Block %s not present on %s, mark as deleted",
                                      blockName, location)
                        binds = {'DELETED': 1, 'BLOCKNAME': blockName}
                        self.markBlocksDeleted.execute(binds)
                    elif sites.issubset(nodes):
                        logging.debug("Deleting block %s from %s since it is fully transferred",
                                      blockName, location)
                        if location not in deletableEntries:
                            deletableEntries[location] = {}
                        if dataset not in deletableEntries[location]:
                            deletableEntries[location][dataset] = set()
                        deletableEntries[location][dataset].add(blockName)

        binds = []
        for blockName in skippableBlocks:
            binds.append({'DELETED': 2, 'BLOCKNAME': blockName})
        if binds:
            self.markBlocksDeleted.execute(binds)

        for location in deletableEntries:
            chunkSize = 100
            numberOfBlocks = 0
            blocksToDelete = {}
            for dataset in deletableEntries[location]:
                blocksToDelete[dataset] = deletableEntries[location][dataset]
                numberOfBlocks += len(blocksToDelete[dataset])

                if numberOfBlocks > chunkSize:
                    self.deleteBlocksPhEDExCalls(location, blocksToDelete)
                    numberOfBlocks = 0
                    blocksToDelete = {}

            self.deleteBlocksPhEDExCalls(location, blocksToDelete)
        return

    def deleteBlocksPhEDExCalls(self, location, blocksToDelete):
        """
        _deleteBlocksPhEDExCalls_

        The actual PhEDEx calls for block deletion.
        """
        deletion = PhEDExDeletion(blocksToDelete.keys(), location,
                                  level='block',
                                  comments="WMAgent blocks auto-delete from %s" % location,
                                  blocks=blocksToDelete)

        try:
            response = self.phedex.delete(deletion)
            requestId = response['phedex']['request_created'][0]['id']
            # auto-approve the deletion request
            self.phedex.updateRequest(requestId, 'approve', location)
        except HTTPException as ex:
            logging.error("PhEDEx block delete/approval failed with HTTPException: %s %s",
                          ex.status, ex.result)
        except Exception as ex:
            logging.error("PhEDEx block delete/approval failed with Exception: %s", str(ex))
            logging.debug("Traceback: %s", str(traceback.format_exc()))
        else:
            binds = []
            for dataset in blocksToDelete:
                for blockName in blocksToDelete[dataset]:
                    binds.append({'DELETED': 1, 'BLOCKNAME': blockName})
            self.markBlocksDeleted.execute(binds)
        return

    def _isDataTierAllowed(self, dataName):
        """
        Check whether the data belongs to a datatier allowed to be handled by
        this component (either to inject or to subscribe into PhEDEx)
        :param dataName: string with the block or the dataset name
        :return: boolean, True if the tier is allowed, False otherwise
        """
        endTier = dataName.rsplit('/', 1)[1]
        endTier = endTier.split('#')[0] if '#' in endTier else endTier
        if endTier in self.listTiersToSkip:
            logging.debug("Skipping data: %s because it's listed in the tiers to skip",
                          dataName)
            return False
        return True

    def subscribeDatasets(self):
        """
        _subscribeDatasets_

        Poll the database for datasets and subscribe them.
        """
        logging.info("Starting subscribeDatasets method")

        # Check for completely unsubscribed datasets
        unsubscribedDatasets = self.getUnsubscribed.execute()

        # Keep a list of subscriptions to tick as subscribed in the database
        subscriptionsMade = []

        # Create a list of subscriptions as defined by the PhEDEx data structures
        subs = SubscriptionList()

        # Create the subscription objects and add them to the list.
        # The list takes care of the sorting internally.
        for subInfo in unsubscribedDatasets:
            ### logic to stop doing things to be done by RucioInjector or by the DM team
            if not self._isDataTierAllowed(subInfo['path']):
                continue

            site = subInfo['site']
            if site not in self.phedexNodes['MSS'] and site not in self.phedexNodes['Disk']:
                msg = "Site %s doesn't appear to be valid to PhEDEx, " % site
                msg += "skipping subscription: %s" % subInfo['id']
                logging.error(msg)
                continue

            # Avoid custodial subscriptions to disk nodes
            if site not in self.phedexNodes['MSS']:
                subInfo['custodial'] = 'n'
            # Avoid auto approval in T1 sites
            elif site.startswith("T1"):
                subInfo['request_only'] = 'y'

            phedexSub = PhEDExSubscription(subInfo['path'], site, subInfo['phedex_group'],
                                           priority=subInfo['priority'],
                                           move=subInfo['move'],
                                           custodial=subInfo['custodial'],
                                           request_only=subInfo['request_only'],
                                           subscriptionId=subInfo['id'])

            # Check if the subscription is a duplicate
            if phedexSub.matchesExistingSubscription(self.phedex) or \
               phedexSub.matchesExistingTransferRequest(self.phedex):
                subscriptionsMade.append(subInfo['id'])
                continue

            # Add it to the list
            subs.addSubscription(phedexSub)

        # Compact the subscriptions
        subs.compact()

        for subscription in subs.getSubscriptionList():
            logging.info("Subscribing: %s to %s, with options: Move: %s, Custodial: %s, Request Only: %s",
                         subscription.getDatasetPaths(), subscription.getNodes(),
                         subscription.move, subscription.custodial, subscription.request_only)
            try:
                self.phedex.subscribe(subscription)
            except HTTPException as ex:
                logging.error("PhEDEx dataset subscribe failed with HTTPException: %s %s",
                              ex.status, ex.result)
            except Exception as ex:
                logging.error("PhEDEx dataset subscribe failed with Exception: %s", str(ex))
                logging.debug("Traceback: %s", str(traceback.format_exc()))
            else:
                subscriptionsMade.extend(subscription.getSubscriptionIds())

        # Register the result in DBSBuffer
        if subscriptionsMade:
            self.markSubscribed.execute(subscriptionsMade)
        return

    def subscribeBlocks(self):
        """
        _subscribeBlocks_

        Poll the database and subscribe blocks not yet subscribed.
        """
        logging.info("Starting subscribeBlocks method")

        unsubBlocks = self.getUnsubscribedBlocks.execute()
        # now organize those by location in order to minimize PhEDEx requests
        # also remove blocks that this component is meant to skip
        unsubBlocks = self.organizeBlocksByLocation(unsubBlocks)

        for location, blockDict in unsubBlocks.items():
            phedexSub = PhEDExSubscription(blockDict.keys(), location, self.phedexGroup,
                                           blocks=blockDict, level="block",
                                           priority="normal", move="n",
                                           custodial="n", request_only="n",
                                           comments="WMAgent production site")
            try:
                res = self.phedex.subscribe(phedexSub)
                transferId = res['phedex']['request_created'][0]['id']
                logging.info("Subscribed %d blocks for %d datasets, to location: %s, under request ID: %s",
                             len(phedexSub.getBlocks()), len(phedexSub.getDatasetPaths()),
                             phedexSub.getNodes(), transferId)
            except HTTPException as ex:
                logging.error("PhEDEx block subscription failed with HTTPException: %s %s",
                              ex.status, ex.result)
                logging.error("The subscription object was: %s", str(phedexSub))
            except Exception as ex:
                logging.exception("PhEDEx block subscription failed with Exception: %s", str(ex))
            else:
                binds = []
                for blockname in phedexSub.getBlocks():
                    binds.append({'RULE_ID': str(transferId), 'BLOCKNAME': blockname})
                self.setBlockRules.execute(binds)

        return

    def organizeBlocksByLocation(self, blocksLocation):
        """
        Given a list of dictionaries (with block name and location), organize
        those blocks per location to make the PhEDEx subscription calls more
        efficient.  Also drop blocks that we cannot subscribe, and check for
        valid PhEDEx node names.
        :param blocksLocation: list of dictionaries
        :return: a dict of dictionaries, such as:
          {"locationA": {"datasetA": ["blockA", "blockB", ...],
                         "datasetB": ["blockA", "blockB", ...]},
           "locationB": {"datasetA": ["blockA"],
           ...
        """
        dictByLocation = {}
        for item in blocksLocation:
            ### logic to stop doing things to be done by RucioInjector or by the DM team
            if not self._isDataTierAllowed(item['blockname']):
                continue

            site = item['pnn']
            if site not in self.phedexNodes['MSS'] and site not in self.phedexNodes['Disk']:
                msg = "Site %s doesn't appear to be valid to PhEDEx, " % site
                msg += "skipping block subscription for: %s" % item['blockname']
                logging.error(msg)
                continue

            dictByLocation.setdefault(site, {})
            dsetName = item['blockname'].split("#")[0]
            dictByLocation[site].setdefault(dsetName, [])
            dictByLocation[site][dsetName].append(item['blockname'])
        return dictByLocation
class PhEDExInjectorSubscriber(BaseWorkerThread):
    """
    _PhEDExInjectorSubscriber_

    Poll the DBSBuffer database and subscribe datasets to MSS as they are
    created.
    """

    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json")
        self.dbsUrl = config.DBSInterface.globalDBSUrl
        self.group = getattr(config.PhEDExInjector, "group", "DataOps")

        # This will be used to map SE names which are stored in the DBSBuffer to
        # PhEDEx node names.  The first key will be the "kind" which consists
        # of one of the following: MSS, Disk, Buffer.  The next key will be the
        # SE name.
        self.seMap = {}
        self.nodeNames = []

        # initialize the alert framework (if available - config.Alert present)
        # self.sendAlert will then be available
        self.initAlerts(compName = "PhEDExInjector")

    def setup(self, parameters):
        """
        _setup_

        Create a DAO Factory for the PhEDExInjector.  Also load the SE names to
        PhEDEx node name mappings from the data service.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database",
                                logger = self.logger,
                                dbinterface = myThread.dbi)

        self.getUnsubscribed = daofactory(classname = "GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname = "MarkDatasetSubscribed")

        nodeMappings = self.phedex.getNodeMap()
        for node in nodeMappings["phedex"]["node"]:
            if node["kind"] not in self.seMap:
                self.seMap[node["kind"]] = {}
            logging.info("Adding mapping %s -> %s" % (node["se"], node["name"]))
            self.seMap[node["kind"]][node["se"]] = node["name"]
            self.nodeNames.append(node["name"])
        return

    def algorithm(self, parameters):
        """
        _algorithm_

        Poll the database for datasets and subscribe them to MSS.
        """
        # Bail out before opening a transaction if there are no MSS nodes
        if "MSS" not in self.seMap:
            return

        myThread = threading.currentThread()
        myThread.transaction.begin()

        unsubscribedDatasets = self.getUnsubscribed.execute(conn = myThread.transaction.conn,
                                                            transaction = True)

        datasetMap = {}
        for unsubscribedDataset in unsubscribedDatasets:
            datasetPath = unsubscribedDataset["path"]
            seName = unsubscribedDataset["se_name"]

            if seName not in self.seMap["MSS"]:
                msg = "No MSS node for SE: %s" % seName
                logging.error(msg)
                self.sendAlert(7, msg = msg)
                continue

            if self.seMap["MSS"][seName] not in datasetMap:
                datasetMap[self.seMap["MSS"][seName]] = []
            datasetMap[self.seMap["MSS"][seName]].append(datasetPath)

            self.markSubscribed.execute(datasetPath,
                                        conn = myThread.transaction.conn,
                                        transaction = True)

        for siteMSS in datasetMap.keys():
            logging.info("Subscribing %s to %s" % (datasetMap[siteMSS], siteMSS))
            newSubscription = PhEDExSubscription(datasetMap[siteMSS], siteMSS,
                                                 self.group, custodial = "y",
                                                 requestOnly = "y")

            xmlData = XMLDrop.makePhEDExXMLForDatasets(self.dbsUrl,
                                                       newSubscription.getDatasetPaths())
            print xmlData
            self.phedex.subscribe(newSubscription, xmlData)

        myThread.transaction.commit()
        return
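
# For reference, the seMap built in setup() above is a two-level dictionary,
# kind -> SE name -> PhEDEx node name.  An illustrative (made-up) example of
# its shape after loading the node map:
#
#   seMap = {"MSS":    {"cmssrm.fnal.gov": "T1_US_FNAL_MSS"},
#            "Buffer": {"cmssrm.fnal.gov": "T1_US_FNAL_Buffer"},
#            "Disk":   {"storage.example.ch": "T2_CH_Example"}}
#
# so self.seMap["MSS"][seName] is the MSS node that a dataset produced at that
# SE should be subscribed to, which is exactly the lookup algorithm() performs.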
class PhEDExInjectorPoller(BaseWorkerThread):
    """
    _PhEDExInjectorPoller_

    Poll the DBSBuffer database and inject files as they are created.
    """

    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json")
        self.dbsUrl = config.DBSInterface.globalDBSUrl
        self.group = getattr(config.PhEDExInjector, "group", "DataOps")

        # This will be used to map SE names which are stored in the DBSBuffer to
        # PhEDEx node names.  The first key will be the "kind" which consists
        # of one of the following: MSS, Disk, Buffer.  The next key will be the
        # SE name.
        self.seMap = {}
        self.nodeNames = []

        # initialize the alert framework (if available - config.Alert present)
        # self.sendAlert will then be available
        self.initAlerts(compName="PhEDExInjector")

    def setup(self, parameters):
        """
        _setup_

        Create a DAO Factory for the PhEDExInjector.  Also load the SE names to
        PhEDEx node name mappings from the data service.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package="WMComponent.PhEDExInjector.Database",
                                logger=self.logger,
                                dbinterface=myThread.dbi)

        self.getUninjected = daofactory(classname="GetUninjectedFiles")
        self.getMigrated = daofactory(classname="GetMigratedBlocks")

        daofactory = DAOFactory(package="WMComponent.DBSBuffer.Database",
                                logger=self.logger,
                                dbinterface=myThread.dbi)
        self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus")

        daofactory = DAOFactory(package="WMComponent.DBSUpload.Database",
                                logger=self.logger,
                                dbinterface=myThread.dbi)
        self.setBlockStatus = daofactory(classname="SetBlockStatus")

        nodeMappings = self.phedex.getNodeMap()
        for node in nodeMappings["phedex"]["node"]:
            if node["kind"] not in self.seMap:
                self.seMap[node["kind"]] = {}
            logging.info("Adding mapping %s -> %s" % (node["se"], node["name"]))
            self.seMap[node["kind"]][node["se"]] = node["name"]
            self.nodeNames.append(node["name"])
        return

    def createInjectionSpec(self, injectionData):
        """
        _createInjectionSpec_

        Transform the data structure returned from the database into an XML
        string for the PhEDEx Data Service.  The injectionData parameter must
        be a dictionary keyed by dataset path.  Each dataset path will map to a
        list of blocks, each block being a dict.  The block dicts will have
        three keys: name, is-open and files.  The files key will be a list of
        dicts, each of which have the following keys: lfn, size and checksum.
        The following is an example object:

        {"dataset1":
          {"block1": {"is-open": "y", "files":
            [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}},
             {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}}
        """
        injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl)

        for datasetPath in injectionData:
            datasetSpec = injectionSpec.getDataset(datasetPath)
            for fileBlockName, fileBlock in injectionData[datasetPath].iteritems():
                blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"])
                for f in fileBlock["files"]:
                    blockSpec.addFile(f["lfn"], f["checksum"], f["size"])

        return injectionSpec.save()

    def injectFiles(self):
        """
        _injectFiles_

        Inject any uninjected files into PhEDEx.
        """
        myThread = threading.currentThread()
        uninjectedFiles = self.getUninjected.execute()

        injectedFiles = []
        for siteName in uninjectedFiles.keys():
            # SE names can be stored in DBSBuffer as that is what is returned in
            # the framework job report.  We'll try to map the SE name to a
            # PhEDEx node name here.
            location = None
            if siteName in self.nodeNames:
                location = siteName
            else:
                if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]:
                    location = self.seMap["Buffer"][siteName]
                elif "MSS" in self.seMap and siteName in self.seMap["MSS"]:
                    location = self.seMap["MSS"][siteName]
                elif "Disk" in self.seMap and siteName in self.seMap["Disk"]:
                    location = self.seMap["Disk"][siteName]

            if location is None:
                msg = "Could not map SE %s to PhEDEx node." % siteName
                logging.error(msg)
                self.sendAlert(7, msg=msg)
                continue

            xmlData = self.createInjectionSpec(uninjectedFiles[siteName])
            injectRes = self.phedex.injectBlocks(location, xmlData, 0, 0)

            if "error" not in injectRes:
                for datasetName in uninjectedFiles[siteName]:
                    for blockName in uninjectedFiles[siteName][datasetName]:
                        for f in uninjectedFiles[siteName][datasetName][blockName]["files"]:
                            injectedFiles.append(f["lfn"])
            else:
                msg = "Error injecting data %s: %s" % (uninjectedFiles[siteName],
                                                       injectRes["error"])
                logging.error(msg)
                self.sendAlert(6, msg=msg)

        if len(injectedFiles) > 0:
            logging.debug("Injecting files: %s" % injectedFiles)
            self.setStatus.execute(injectedFiles, 1,
                                   conn=myThread.transaction.conn,
                                   transaction=myThread.transaction)
        return

    def closeBlocks(self):
        """
        _closeBlocks_

        Close any blocks that have been migrated to global DBS.
        """
        myThread = threading.currentThread()
        migratedBlocks = self.getMigrated.execute()

        closedBlocks = []
        for siteName in migratedBlocks.keys():
            # SE names can be stored in DBSBuffer as that is what is returned in
            # the framework job report.  We'll try to map the SE name to a
            # PhEDEx node name here.
            location = None
            if siteName in self.nodeNames:
                location = siteName
            else:
                if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]:
                    location = self.seMap["Buffer"][siteName]
                elif "MSS" in self.seMap and siteName in self.seMap["MSS"]:
                    location = self.seMap["MSS"][siteName]
                elif "Disk" in self.seMap and siteName in self.seMap["Disk"]:
                    location = self.seMap["Disk"][siteName]

            if location is None:
                msg = "Could not map SE %s to PhEDEx node." % siteName
                logging.error(msg)
                self.sendAlert(6, msg=msg)
                continue

            xmlData = self.createInjectionSpec(migratedBlocks[siteName])
            injectRes = self.phedex.injectBlocks(location, xmlData, 0, 0)

            if "error" not in injectRes:
                for datasetName in migratedBlocks[siteName]:
                    for blockName in migratedBlocks[siteName][datasetName]:
                        closedBlocks.append(blockName)
            else:
                msg = "Error injecting data %s: %s" % (migratedBlocks[siteName],
                                                       injectRes["error"])
                logging.error(msg)
                self.sendAlert(6, msg=msg)

        for closedBlock in closedBlocks:
            logging.debug("Closing block %s" % closedBlock)
            self.setBlockStatus.execute(closedBlock, locations=None,
                                        open_status="Closed",
                                        conn=myThread.transaction.conn,
                                        transaction=myThread.transaction)
        return

    def algorithm(self, parameters):
        """
        _algorithm_

        Poll the database for uninjected files and attempt to inject them into
        PhEDEx.
        """
        myThread = threading.currentThread()
        myThread.transaction.begin()

        self.injectFiles()
        self.closeBlocks()

        myThread.transaction.commit()
        return
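
# A short usage sketch for the poller's createInjectionSpec(): the injection
# data follows the {dataset: {block: {"is-open": ..., "files": [...]}}} layout
# described in the docstring above.  Dataset, block and LFN names below are
# made up for illustration.

exampleInjectionData = {
    "/Primary/Processed-v1/RECO": {
        "/Primary/Processed-v1/RECO#block-uuid-1": {
            "is-open": "y",
            "files": [{"lfn": "/store/data/file1.root",
                       "size": 1000,
                       "checksum": {"cksum": "1234"}}],
        }
    }
}

# With a configured poller instance, the injection call would then look like
# the one in injectFiles() above (node name is a placeholder):
#
#   xmlData = poller.createInjectionSpec(exampleInjectionData)
#   injectRes = poller.phedex.injectBlocks("T1_US_FNAL_Buffer", xmlData, 0, 0)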
class RequestQuery:

    def __init__(self, config):
        self.br = Browser()
        self.config = config

        # Initialise connections
        self.phedex = PhEDEx({"endpoint": "https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url + "phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url + "phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url + "phys03/DBSReader/")

    def __del__(self):
        self.br.close()

    def getScramArchByCMSSW(self):
        """
        Get the list of available CMSSW releases and
        return a dictionary of ScramArchitecture by CMSSW.
        """
        # Set a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response = br.open(url)
        soup = BeautifulSoup(response.read())

        # Dictionary form
        # {'CMSSW_X_X_X': ['slc5_amd64_gcc472'], ... }
        archByCmssw = {}

        # Fill the dictionary
        for arch in soup.find_all('architecture'):
            for cmssw in arch.find_all('project'):
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel] = []
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)

        return archByCmssw

    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return a list of block origin sites.
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True, dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True, dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True, dataset=data)

        seList = []
        for block in response:
            if block['origin_site_name'] not in seList:
                seList.append(block['origin_site_name'])

        siteNames = []
        for node in self.nodeMappings['phedex']['node']:
            if node['se'] in seList:
                siteNames.append(node['name'])

        return siteNames, seList

    def phEDExNodetocmsName(self, nodeList):
        """
        Convert a list of PhEDEx node names to a list of CMS site names.
        """
        names = []
        for node in nodeList:
            name = node.replace('_MSS', '').replace('_Disk', '').replace('_Buffer', '').replace('_Export', '')
            if name not in names:
                names.append(name)
        return names

    def setGlobalTagFromOrigin(self, dbs_url, input_dataset):
        """
        Get the global tag of the dataset from the source dbs url.
        If it is not set, then set the global tag to 'UNKNOWN'.
        """
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)

        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'

        return globalTag

    def isDataAtUrl(self, dbs_url, input_dataset):
        """
        Returns True if the dataset is at the dbs url, False otherwise.
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)

        # An empty response means that the dataset is not at the url
        if not response:
            return False
        else:
            return True

    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary keyed by value.
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
        return d

    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary keyed by label.
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value
        return d

    def createRequestJSON(self, ticket, input_dataset, dbs_url, cmssw_release, group_name, version = 1):
        """
        Creates a JSON file 'Ticket_#TICKET.json' with the information needed
        for creating a request on ReqMgr.

        Input:
            - ticket: the ticket number, for instance 110773 on
              https://ggus.eu/?mode=ticket_info&ticket_id=110773
            - input_dataset
            - dbs_url: only the instance name, for example "phys01" for
              https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader
            - cmssw_release
            - group_name: the physics group name
            - version: the dataset version, 1 by default.

        It returns a dictionary that contains the request information.
        """
        scramArchByCMSSW = self.getScramArchByCMSSW()
        self.nodeMappings = self.phedex.getNodeMap()
        task = ticket
        print "Processing ticket: %s" % task

        # splitting the input dataset
        input_primary_dataset = input_dataset.split('/')[1].replace(' ', '')
        input_processed_dataset = input_dataset.split('/')[2].replace(' ', '')
        data_tier = input_dataset.split('/')[3].replace(' ', '')

        # Transform the input value to a valid DBS url
        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
        dbs_url = dbs_base_url + dbs_url + "/DBSReader"
        release_id = cmssw_release

        # check if a deprecated release was used
        release = cmssw_release
        # check if the release has no ScramArch match
        if release not in scramArchByCMSSW:
            raise Exception("Error on ticket %s due to ScramArch mismatch" % task)
        else:
            scram_arch = scramArchByCMSSW[release][-1]

        # check if the dataset is not at the dbs url
        try:
            data_at_url = self.isDataAtUrl(dbs_url, input_dataset)
        except:
            raise Exception('Error on ticket %s, dataset %s not available at %s' % (task, input_dataset, dbs_url))

        if not data_at_url:
            raise Exception('Error on ticket %s, dataset %s not available at %s' % (task, input_dataset, dbs_url))

        ## Get the Physics Group
        group_squad = 'cms-storeresults-' + group_name.replace("-", "_").lower()

        ## Get the Dataset Version
        dataset_version = str(version)

        # Set the default Acquisition Era for StoreResults
        acquisitionEra = "StoreResults"

        ## Construction of the new dataset name (ProcessingString)
        ## remove the leading hypernews or physics group name and StoreResults+Version
        if input_processed_dataset.find(group_name) == 0:
            new_dataset = input_processed_dataset.replace(group_name, "", 1)
        else:
            stripped_dataset = input_processed_dataset.split("-")[1:]
            new_dataset = '_'.join(stripped_dataset)

        # Get the dataset site info:
        phedex_map, se_names = self.getDatasetOriginSites(dbs_url, input_dataset)
        sites = self.phEDExNodetocmsName(phedex_map)

        infoDict = {}
        # Build the StoreResults json
        # First add all the default values
        infoDict["RequestType"] = "StoreResults"
        infoDict["UnmergedLFNBase"] = "/store/unmerged"
        infoDict["MergedLFNBase"] = "/store/results/" + group_name.replace("-", "_").lower()
        infoDict["MinMergeSize"] = 1500000000
        infoDict["MaxMergeSize"] = 5000000000
        infoDict["MaxMergeEvents"] = 100000
        infoDict["TimePerEvent"] = 40
        infoDict["SizePerEvent"] = 512.0
        infoDict["Memory"] = 2394
        infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"
        infoDict["Group"] = "DATAOPS"
        infoDict["DbsUrl"] = dbs_url

        # Add all the information pulled from Savannah
        infoDict["AcquisitionEra"] = acquisitionEra
        infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url, input_dataset)
        infoDict["DataTier"] = data_tier
        infoDict["InputDataset"] = input_dataset
        infoDict["ProcessingString"] = new_dataset
        infoDict["CMSSWVersion"] = release
        infoDict["ScramArch"] = scram_arch
        infoDict["ProcessingVersion"] = dataset_version
        infoDict["SiteWhitelist"] = list(sites)

        # Create the report for Migration2Global
        report = {}

        # Fill the json file, if the status is done
        self.writeJSONFile(task, infoDict)
        report["json"] = 'y'
        report["task"] = int(task)
        report["InputDataset"] = input_dataset
        report["ProcessingString"] = new_dataset
        report["localUrl"] = dbs_url
        report["sites"] = list(sites)
        report["se_names"] = list(se_names)

        return report

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir.
        """
        ## check if the file already exists
        filename = self.config["ComponentDir"] + '/Ticket_' + str(task) + '.json'
        if not os.access(filename, os.F_OK):
            jsonfile = open(filename, 'w')
            request = {'createRequest': infoDict}  ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request, sort_keys=True, indent=4))
            jsonfile.close()
        return

    def removeJSONFile(self, task):
        """
        This removes the JSON file at ComponentDir if it was created.
        """
        filename = self.config["ComponentDir"] + '/Ticket_' + str(task) + '.json'
        if os.access(filename, os.F_OK):
            os.remove(filename)
        return

    def printReport(self, report):
        """
        Print out a report.
        """
        print "%20s %5s %10s %50s %50s" % ('Ticket', 'json', 'local DBS', 'Sites', 'se_names')
        print "%20s %5s %10s %50s %50s" % ('-' * 20, '-' * 5, '-' * 10, '-' * 50, '-' * 50)

        json = report["json"]
        ticket = report["task"]
        #status = report["ticketStatus"]
        localUrl = report["localUrl"].split('/')[5]
        site = ', '.join(report["sites"])
        se_names = ', '.join(report["se_names"])
        print "%20s %5s %10s %50s %50s" % (ticket, json, localUrl, site, se_names)
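
# The JSON file written by writeJSONFile() wraps the request dictionary under a
# 'createRequest' key, so Ticket_110773.json (using the ticket number from the
# createRequestJSON docstring) would look roughly like the snippet below.  The
# dataset and release values are placeholders, not real request content:
#
#   {
#       "createRequest": {
#           "RequestType": "StoreResults",
#           "InputDataset": "/Primary/Processed/USER",
#           "DbsUrl": "https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader",
#           "CMSSWVersion": "CMSSW_X_Y_Z",
#           "ScramArch": "slc5_amd64_gcc472",
#           ...
#       }
#   }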
class PhEDExInjectorSubscriber(BaseWorkerThread):
    """
    _PhEDExInjectorSubscriber_

    Poll the DBSBuffer database and subscribe datasets as they are created.
    """

    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json")
        self.siteDB = SiteDBJSON()
        self.dbsUrl = config.DBSInterface.globalDBSUrl
        self.group = getattr(config.PhEDExInjector, "group", "DataOps")

        # We will map node names to CMS names, which is what the spec will have.
        # If a CMS name is associated to many PhEDEx nodes then choose the MSS option.
        self.cmsToPhedexMap = {}
        self.phedexNodes = {'MSS': [], 'Disk': []}

        # initialize the alert framework (if available - config.Alert present)
        # self.sendAlert will then be available
        self.initAlerts(compName = "PhEDExInjector")

    def setup(self, parameters):
        """
        _setup_

        Create a DAO Factory for the PhEDExInjector.  Also load the PhEDEx node
        name to CMS name mappings from the data service.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database",
                                logger = self.logger,
                                dbinterface = myThread.dbi)

        self.getUnsubscribed = daofactory(classname = "GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname = "MarkDatasetSubscribed")

        nodeMappings = self.phedex.getNodeMap()
        for node in nodeMappings["phedex"]["node"]:
            cmsName = self.siteDB.phEDExNodetocmsName(node["name"])
            if cmsName not in self.cmsToPhedexMap:
                self.cmsToPhedexMap[cmsName] = {}
            logging.info("Loaded PhEDEx node %s for site %s" % (node["name"], cmsName))
            if node["kind"] not in self.cmsToPhedexMap[cmsName]:
                self.cmsToPhedexMap[cmsName][node["kind"]] = node["name"]
            if node["kind"] in ["MSS", "Disk"]:
                self.phedexNodes[node["kind"]].append(node["name"])
        return

    def algorithm(self, parameters):
        """
        _algorithm_

        Run the subscription algorithm as configured
        """
        self.subscribeDatasets()
        return

    def subscribeDatasets(self):
        """
        _subscribeDatasets_

        Poll the database for datasets and subscribe them.
        """
        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Check for completely unsubscribed datasets
        unsubscribedDatasets = self.getUnsubscribed.execute(conn = myThread.transaction.conn,
                                                            transaction = True)

        # Keep a list of subscriptions to tick as subscribed in the database
        subscriptionsMade = []

        # Create a list of subscriptions as defined by the PhEDEx data structures
        subs = SubscriptionList()

        # Create the subscription objects and add them to the list.
        # The list takes care of the sorting internally.
        for subInfo in unsubscribedDatasets:
            site = subInfo['site']

            if site not in self.phedexNodes['MSS'] and site not in self.phedexNodes['Disk']:
                if site not in self.cmsToPhedexMap:
                    msg = "Site %s doesn't appear to be valid to PhEDEx, " % site
                    msg += "skipping subscription: %s" % subInfo['id']
                    logging.error(msg)
                    self.sendAlert(7, msg = msg)
                    continue
                # Get the PhEDEx node from the CMS site name
                site = self.cmsToPhedexMap[site].get("MSS") or self.cmsToPhedexMap[site]["Disk"]

            # Avoid custodial subscriptions to disk nodes
            if site not in self.phedexNodes['MSS']:
                subInfo['custodial'] = 'n'
            # Avoid auto approval in T1 sites
            elif site.startswith("T1"):
                subInfo['request_only'] = 'y'

            phedexSub = PhEDExSubscription(subInfo['path'], site, self.group,
                                           priority = subInfo['priority'],
                                           move = subInfo['move'],
                                           custodial = subInfo['custodial'],
                                           request_only = subInfo['request_only'],
                                           subscriptionId = subInfo['id'])

            # Check if the subscription is a duplicate
            if phedexSub.matchesExistingSubscription(self.phedex) or \
               phedexSub.matchesExistingTransferRequest(self.phedex):
                subscriptionsMade.append(subInfo['id'])
                continue

            # Add it to the list
            subs.addSubscription(phedexSub)

        # Compact the subscriptions
        subs.compact()

        for subscription in subs.getSubscriptionList():
            try:
                xmlData = XMLDrop.makePhEDExXMLForDatasets(self.dbsUrl,
                                                           subscription.getDatasetPaths())
                logging.debug(str(xmlData))
                msg = "Subscribing: %s to %s, with options: " % (subscription.getDatasetPaths(),
                                                                 subscription.getNodes())
                msg += "Move: %s, Custodial: %s, Request Only: %s" % (subscription.move,
                                                                      subscription.custodial,
                                                                      subscription.request_only)
                logging.info(msg)
                self.phedex.subscribe(subscription, xmlData)
            except Exception as ex:
                logging.error("Something went wrong when communicating with PhEDEx, will try again later.")
                logging.error("Exception: %s" % str(ex))
            else:
                subscriptionsMade.extend(subscription.getSubscriptionIds())

        # Register the result in DBSBuffer
        if subscriptionsMade:
            self.markSubscribed.execute(subscriptionsMade,
                                        conn = myThread.transaction.conn,
                                        transaction = True)

        myThread.transaction.commit()
        return
class RequestQuery:

    def __init__(self, config):
        self.br = Browser()
        self.config = config

        # Initialise connections
        self.mySiteDB = SiteDBJSON()
        self.phedex = PhEDEx({"endpoint": "https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url + "phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url + "phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url + "phys03/DBSReader/")

    def __del__(self):
        self.br.close()

    def login2Savannah(self):
        """
        login2Savannah

        Log into savannah with the parameters given in the config (username and
        password).  The user must have admin privileges for store results
        requests.
        """
        login_page = 'https://savannah.cern.ch/account/login.php?uri=%2F'
        savannah_page = 'https://savannah.cern.ch/task/?group=cms-storeresults'

        self.br.open(login_page)

        ## 'Search' form is form 0
        ## login form is form 1
        self.br.select_form(nr=1)

        username = self.config["SavannahUser"]
        self.br['form_loginname'] = username
        self.br['form_pw'] = self.config["SavannahPasswd"]

        self.br.submit()
        response = self.br.open(savannah_page)

        # Check to see if the login was successful
        if not re.search('Logged in as ' + username, response.read()):
            print('login unsuccessful, please check your username and password')
            return False
        else:
            return True

    def selectQueryForm(self, **kargs):
        """
        selectQueryForm

        Create the browser view to get all the store results tickets from
        savannah.
        """
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")

            ## Use the right query form, labelled Test
            control = self.br.find_control("report_id", type="select")
            for item in control.items:
                if item.attrs['label'] == "Test":
                    control.value = [item.attrs['value']]

            ## select the number of entries displayed per page
            control = self.br.find_control("chunksz", type="text")
            control.value = "150"

            ## check additional search parameters
            for arg in kargs:
                if arg == "approval_status":
                    control = self.br.find_control("resolution_id", type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]
                elif arg == "task_status":
                    control = self.br.find_control("status_id", type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]
                elif arg == "team":
                    control = self.br.find_control("custom_sb5", type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]

            response = self.br.submit()
            response.read()
        return

    def getScramArchByCMSSW(self):
        """
        Get the list of available CMSSW releases and
        return a dictionary of ScramArchitecture by CMSSW.
        """
        # Set a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response = br.open(url)
        soup = BeautifulSoup(response.read())

        # Dictionary form
        # {'CMSSW_X_X_X': ['slc5_amd64_gcc472'], ... }
        archByCmssw = {}

        # Fill the dictionary
        for arch in soup.find_all('architecture'):
            for cmssw in arch.find_all('project'):
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel] = []
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)

        return archByCmssw

    def createValueDicts(self):
        """
        Init dictionaries by value/label:
        - Releases by value
        - Physics group by value
        - DBS url by value
        - DBS url by label
        - Status of the savannah request by value
        - Status of the savannah ticket by value (Open/Closed/Any)
        """
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")

            control = self.br.find_control("custom_sb2", type="select")
            self.ReleaseByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb3", type="select")
            self.GroupByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb4", type="select")
            self.DBSByValueDict = self.getLabelByValueDict(control)
            self.DBSByLabelDict = self.getValueByLabelDict(control)

            control = self.br.find_control("resolution_id", type="select")
            self.StatusByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("status_id", type="select")
            self.TicketStatusByLabelDict = self.getValueByLabelDict(control)
        return

    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return a list of block origin sites.
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True, dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True, dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True, dataset=data)

        pnnList = set()
        for block in response:
            pnnList.add(block['origin_site_name'])
        psnList = self.mySiteDB.PNNstoPSNs(pnnList)

        return psnList, list(pnnList)

    def phEDExNodetocmsName(self, nodeList):
        """
        Convert a list of PhEDEx node names to a list of CMS site names.
        """
        names = []
        for node in nodeList:
            name = node.replace('_MSS', '').replace('_Disk', '').replace('_Buffer', '').replace('_Export', '')
            if name not in names:
                names.append(name)
        return names

    def setGlobalTagFromOrigin(self, dbs_url, input_dataset):
        """
        Get the global tag of the dataset from the source dbs url.
        If it is not set, then set the global tag to 'UNKNOWN'.
        """
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)

        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'

        return globalTag

    def isDataAtUrl(self, dbs_url, input_dataset):
        """
        Returns True if the dataset is at the dbs url, False otherwise.
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)

        # An empty response means that the dataset is not at the url
        if not response:
            return False
        else:
            return True

    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary keyed by value.
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
        return d

    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary keyed by label.
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value
        return d

    def getRequests(self, **kargs):
        """
        getRequests

        Actually goes through all the savannah requests and creates json files
        if the ticket is not Closed and the status of the item is Done.  It
        also reports back a summary of the requests in savannah.
        """
        requests = []

        # Open the Browser and log into Savannah
        self.br = Browser()
        self.isLoggedIn = self.login2Savannah()

        if self.isLoggedIn:
            if not kargs:
                self.selectQueryForm(approval_status='1', task_status='0')
            else:
                self.selectQueryForm(**kargs)
            self.createValueDicts()

            self.br.select_form(name="bug_form")
            response = self.br.submit()
            html_output = response.read()

            scramArchByCMSSW = self.getScramArchByCMSSW()
            self.nodeMappings = self.phedex.getNodeMap()

            for link in self.br.links(text_regex="#[0-9]+"):
                response = self.br.follow_link(link)

                try:
                    ## Get Information
                    self.br.select_form(name="item_form")

                    ## remove the leading '#' (and whitespace) from the task number
                    task = link.text.replace('#', '').decode('utf-8').strip()
                    print("Processing ticket: %s" % task)

                    ## Get the input dataset name
                    control = self.br.find_control("custom_tf1", type="text")
                    input_dataset = control.value
                    input_primary_dataset = input_dataset.split('/')[1].replace(' ', '')
                    input_processed_dataset = input_dataset.split('/')[2].replace(' ', '')
                    data_tier = input_dataset.split('/')[3].replace(' ', '')

                    ## Get the DBS URL from the drop-down menu
                    control = self.br.find_control("custom_sb4", type="select")
                    dbs_url = self.DBSByValueDict[control.value[0]]

                    ## Get the DBS URL from the text field (for old entries)
                    if dbs_url == 'None':
                        control = self.br.find_control("custom_tf4", type="text")
                        dbs_url = control.value.replace(' ', '')
                    else:
                        # Transform the input value to a valid DBS url
                        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
                        dbs_url = dbs_base_url + dbs_url + "/DBSReader"

                    ## Get the Release
                    control = self.br.find_control("custom_sb2", type="select")
                    release_id = control.value

                    ## Get the current request status
                    control = self.br.find_control("status_id", type="select")
                    request_status_id = control.value
                    RequestStatusByValueDict = self.getLabelByValueDict(control)

                    # close the request if a deprecated release was used
                    try:
                        release = self.ReleaseByValueDict[release_id[0]]
                    except:
                        if len(self.ReleaseByValueDict) > 0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid anymore, since the given CMSSW release is deprecated. If your request should be still processed, please reopen the request and update the CMSSW release to a more recent *working* release.\n"
                            msg += "\n"
                            msg += "Thanks,\n"
                            msg += "Your StoreResults team"
                            self.closeRequest(task, msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to CMSSW not valid" % task)
                        continue

                    # close the request if the release has no ScramArch match
                    if release not in scramArchByCMSSW:
                        if len(self.ReleaseByValueDict) > 0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid, there is no ScramArch match for the given CMSSW release.\n"
                            msg += "If your request should be still processed, please reopen the request and update the CMSSW release according to: https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML \n"
                            msg += "\n"
                            msg += "Thanks,\n"
                            msg += "Your StoreResults team"
                            self.closeRequest(task, msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to ScramArch mismatch" % task)
                        continue
                    else:
                        index = len(scramArchByCMSSW[release])
                        scram_arch = scramArchByCMSSW[release][index - 1]

                    # close the request if the dataset is not at the dbs url
                    try:
                        data_at_url = self.isDataAtUrl(dbs_url, input_dataset)
                    except:
                        print('I got an error trying to look for dataset %s at %s, please look at this ticket: %s' % (input_dataset, dbs_url, task))
                        continue

                    if not data_at_url:
                        msg = "Your request is not valid, I could not find the given dataset at %s\n" % dbs_url
                        msg += "If your request should be still processed, please reopen the request and change the DBS url properly \n"
                        msg += "\n"
                        msg += "Thanks,\n"
                        msg += "Your StoreResults team"
                        self.closeRequest(task, msg)
                        self.br.back()
                        print("I tried to Close ticket %s, dataset is not at DBS url" % task)
                        continue

                    # Avoid tickets that are not approved
                    #if not RequestStatusByValueDict[request_status_id[0]] == "Done":
                    #    continue

                    ## Get the Physics Group
                    control = self.br.find_control("custom_sb3", type="select")
                    group_id = control.value[0]
                    group_squad = 'cms-storeresults-' + self.GroupByValueDict[group_id].replace("-", "_").lower()

                    ## Get the Dataset Version
                    control = self.br.find_control("custom_tf3", type="text")
                    dataset_version = control.value.replace(' ', '')
                    if dataset_version == "":
                        dataset_version = '1'

                    ## Get the current status
                    control = self.br.find_control("resolution_id", type="select")
                    status_id = control.value

                    ## Get assigned to
                    control = self.br.find_control("assigned_to", type="select")
                    AssignedToByValueDict = self.getLabelByValueDict(control)
                    assignedTo_id = control.value

                    ## Assign the task to the physics group squad
                    if AssignedToByValueDict[assignedTo_id[0]] != group_squad:
                        assignedTo_id = [self.getValueByLabelDict(control)[group_squad]]
                        control.value = assignedTo_id
                        self.br.submit()

                    # Set the default Acquisition Era for StoreResults
                    acquisitionEra = "StoreResults"

                    ## Construction of the new dataset name (ProcessingString)
                    ## remove the leading hypernews or physics group name and StoreResults+Version
                    if input_processed_dataset.find(self.GroupByValueDict[group_id]) == 0:
                        new_dataset = input_processed_dataset.replace(self.GroupByValueDict[group_id], "", 1)
                    else:
                        stripped_dataset = input_processed_dataset.split("-")[1:]
                        new_dataset = '_'.join(stripped_dataset)

                except Exception as ex:
                    self.br.back()
                    print("There is a problem with this ticket %s, please have a look at the error:" % task)
                    print(str(ex))
                    print(traceback.format_exc())
                    continue

                self.br.back()
                #
Get dataset site info: psnList, pnnList = self.getDatasetOriginSites(dbs_url,input_dataset) infoDict = {} # Build store results json # First add all the defaults values infoDict["RequestType"] = "StoreResults" infoDict["UnmergedLFNBase"] = "/store/unmerged" infoDict["MergedLFNBase"] = "/store/results/" + self.GroupByValueDict[group_id].replace("-","_").lower() infoDict["MinMergeSize"] = 1500000000 infoDict["MaxMergeSize"] = 5000000000 infoDict["MaxMergeEvents"] = 100000 infoDict["TimePerEvent"] = 40 infoDict["SizePerEvent"] = 512.0 infoDict["Memory"] = 2394 infoDict["CmsPath"] = "/uscmst1/prod/sw/cms" infoDict["Group"] = "DATAOPS" infoDict["DbsUrl"] = dbs_url # Add all the information pulled from Savannah infoDict["AcquisitionEra"] = acquisitionEra infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url,input_dataset) infoDict["DataTier"] = data_tier infoDict["InputDataset"] = input_dataset infoDict["ProcessingString"] = new_dataset infoDict["CMSSWVersion"] = release infoDict["ScramArch"] = scram_arch infoDict["ProcessingVersion"] = dataset_version infoDict["SiteWhitelist"] = psnList # Create report for Migration2Global report = {} #Fill json file, if status is done if self.StatusByValueDict[status_id[0]]=='Done' and RequestStatusByValueDict[request_status_id[0]] != "Closed": self.writeJSONFile(task, infoDict) report["json"] = 'y' else: report["json"] = 'n' report["task"] = int(task) report["InputDataset"] = input_dataset report["ProcessingString"] = new_dataset report["ticketStatus"] = self.StatusByValueDict[status_id[0]] report["assignedTo"] = AssignedToByValueDict[assignedTo_id[0]] report["localUrl"] = dbs_url report["sites"] = psnList report["pnns"] = pnnList # if the request is closed, change the item status to report to Closed if report["ticketStatus"] == "Done" and RequestStatusByValueDict[request_status_id[0]] == "Closed": report["ticketStatus"] = "Closed" requests.append(report) # Print out report self.printReport(requests) # Close connections self.br.close() return requests def closeRequest(self,task,msg): """ This close a specific savannag ticket Insert a message in the ticket """ if self.isLoggedIn: #self.createValueDicts() response = self.br.open('https://savannah.cern.ch/task/?'+str(task)) html = response.read() self.br.select_form(name="item_form") control = self.br.find_control("status_id",type="select") control.value = [self.TicketStatusByLabelDict["Closed"]] #Put reason to the comment field control = self.br.find_control("comment",type="textarea") control.value = msg #DBS Drop Down is a mandatory field, if set to None (for old requests), it is not possible to close the request self.setDBSDropDown() self.br.submit() #remove JSON ticket self.removeJSONFile(task) self.br.back() return def setDBSDropDown(self): ## Get DBS URL by Drop Down control = self.br.find_control("custom_sb4",type="select") dbs_url = self.DBSByValueDict[control.value[0]] ## Get DBS URL by text field (for old entries) if dbs_url=='None': tmp = self.br.find_control("custom_tf4",type="text") dbs_url = tmp.value.replace(' ','') if dbs_url.find("phys01")!=-1: control.value = [self.DBSByLabelDict["phys01"]] elif dbs_url.find("phys02")!=-1: control.value = [self.DBSByLabelDict["phys02"]] elif dbs_url.find("phys03")!=-1: control.value = [self.DBSByLabelDict["phys03"]] else: msg = 'DBS URL of the old request is neither phys01, phys02 nor phys03. Please, check!' 
            print(msg)
            raise RuntimeError(msg)
        return

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        ## check if file already exists
        filename = self.config["ComponentDir"] + '/Ticket_' + str(task) + '.json'
        if not os.access(filename, os.F_OK):
            jsonfile = open(filename, 'w')
            request = {'createRequest': infoDict}  ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request, sort_keys=True, indent=4))
            jsonfile.close()  # call close() so the ticket JSON is flushed to disk
        return

    def removeJSONFile(self, task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"] + '/Ticket_' + str(task) + '.json'
        if os.access(filename, os.F_OK):
            os.remove(filename)
        return

    def printReport(self, requests):
        """
        Print out a report
        """
        print("%20s %10s %5s %35s %10s %50s %50s" % ('Savannah Ticket', 'Status', 'json', 'Assigned to', 'local DBS', 'Sites', 'pnns'))
        print("%20s %10s %5s %35s %10s %50s %50s" % ('-'*20, '-'*10, '-'*5, '-'*35, '-'*10, '-'*50, '-'*50))
        for report in requests:
            jsonFlag = report["json"]  # 'y'/'n'; a local named 'json' would shadow the json module
            ticket = report["task"]
            status = report["ticketStatus"]
            assigned = report["assignedTo"]
            localUrl = report["localUrl"].split('/')[5]
            site = ', '.join(report["sites"])
            pnns = ', '.join(report["pnns"])
            print("%20s %10s %5s %35s %10s %50s %50s" % (ticket, status, jsonFlag, assigned, localUrl, site, pnns))
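# -- Hedged sketch, not part of the original class above: the DBS instance
# dispatch repeated in getDatasetOriginSites/setGlobalTagFromOrigin/isDataAtUrl
# keys off the sixth path segment of the DBS URL, e.g.
# 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader' -> 'phys03'.
# Here 'readers' is a stand-in dict for the self.dbsPhys01/02/03 reader objects.

def pickReader(dbs_url, readers):
    """Return the DBS reader matching the local instance encoded in dbs_url."""
    local_dbs = dbs_url.split('/')[5]
    if local_dbs not in readers:
        raise ValueError("Unsupported DBS instance: %s" % local_dbs)
    return readers[local_dbs]

# usage sketch: pickReader(dbs_url, readers).listBlocks(detail=True, dataset=data)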
class PhEDExInjectorSubscriber(BaseWorkerThread): """ _PhEDExInjectorSubscriber_ Poll the DBSBuffer database and subscribe datasets as they are created. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") self.siteDB = SiteDBJSON() self.dbsUrl = config.DBSInterface.globalDBSUrl self.group = getattr(config.PhEDExInjector, "group", "DataOps") self.tier0Mode = getattr(config.PhEDExInjector, "tier0Mode", False) # We will map node names to CMS names, that what the spec will have. # If a CMS name is associated to many PhEDEx node then choose the MSS option self.cmsToPhedexMap = {} self.phedexNodes = {'MSS':[], 'Disk':[]} # initialize the alert framework (if available - config.Alert present) # self.sendAlert will be then be available self.initAlerts(compName = "PhEDExInjector") def setup(self, parameters): """ _setup_ Create a DAO Factory for the PhEDExInjector. Also load the SE names to PhEDEx node name mappings from the data service. """ myThread = threading.currentThread() daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database", logger = self.logger, dbinterface = myThread.dbi) self.getUnsubscribed = daofactory(classname = "GetUnsubscribedDatasets") self.getUnsubscribedBlocks = daofactory(classname = "GetUnsubscribedBlocks") self.markSubscribed = daofactory(classname = "MarkDatasetSubscribed") nodeMappings = self.phedex.getNodeMap() for node in nodeMappings["phedex"]["node"]: cmsName = self.siteDB.phEDExNodetocmsName(node["name"]) if cmsName not in self.cmsToPhedexMap: self.cmsToPhedexMap[cmsName] = {} logging.info("Loaded PhEDEx node %s for site %s" % (node["name"], cmsName)) if node["kind"] not in self.cmsToPhedexMap[cmsName]: self.cmsToPhedexMap[cmsName][node["kind"]] = node["name"] if node["kind"] in [ "MSS", "Disk" ]: self.phedexNodes[node["kind"]].append(node["name"]) return def algorithm(self, parameters): """ _algorithm_ Run the subscription algorithm as configured """ if self.tier0Mode: self.subscribeTier0Blocks() self.subscribeDatasets() return def subscribeTier0Blocks(self): """ _subscribeTier0Blocks_ Subscribe blocks to the Tier-0 where a replica subscription already exists. All Tier-0 subscriptions are move, custodial and autoapproved with high priority. """ myThread = threading.currentThread() myThread.transaction.begin() # Check for candidate blocks for subscription blocksToSubscribe = self.getUnsubscribedBlocks.execute(node = 'T0_CH_CERN', conn = myThread.transaction.conn, transaction = True) if not blocksToSubscribe: return # For the blocks we don't really care about the subscription options # We are subscribing all blocks with the same recipe. 
        subscriptionMap = {}
        for subInfo in blocksToSubscribe:
            dataset = subInfo['path']
            if dataset not in subscriptionMap:
                subscriptionMap[dataset] = []
            subscriptionMap[dataset].append(subInfo['blockname'])

        site = 'T0_CH_CERN'
        custodial = 'y'
        request_only = 'n'
        move = 'y'
        priority = 'High'

        # Get the phedex node
        phedexNode = self.cmsToPhedexMap[site]["MSS"]
        # block count is the sum over datasets, dataset count is the map size
        logging.info("Subscribing %d blocks, from %d datasets to the Tier-0" % (sum([len(x) for x in subscriptionMap.values()]), len(subscriptionMap)))
        newSubscription = PhEDExSubscription(subscriptionMap.keys(), phedexNode, self.group,
                                             custodial = custodial, request_only = request_only,
                                             move = move, priority = priority,
                                             level = 'block', blocks = subscriptionMap)
        # TODO: Check for blocks already subscribed
        try:
            xmlData = XMLDrop.makePhEDExXMLForBlocks(self.dbsUrl, newSubscription.getDatasetsAndBlocks())
            logging.debug(str(xmlData))
            self.phedex.subscribe(newSubscription, xmlData)
        except Exception as ex:
            logging.error("Something went wrong when communicating with PhEDEx, will try again later.")
            logging.error("Exception: %s" % str(ex))
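# -- Illustration only: the grouping loop in subscribeTier0Blocks above is
# equivalent to the defaultdict idiom below (sample rows are made up; the
# real ones come from the GetUnsubscribedBlocks DAO).
from collections import defaultdict

def groupBlocksByDataset(rows):
    subscriptionMap = defaultdict(list)
    for subInfo in rows:
        subscriptionMap[subInfo['path']].append(subInfo['blockname'])
    return dict(subscriptionMap)

rows = [{'path': '/Prim/Proc/RAW', 'blockname': '/Prim/Proc/RAW#1'},
        {'path': '/Prim/Proc/RAW', 'blockname': '/Prim/Proc/RAW#2'}]
assert groupBlocksByDataset(rows) == {'/Prim/Proc/RAW': ['/Prim/Proc/RAW#1', '/Prim/Proc/RAW#2']}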
class PhEDExInjectorPoller(BaseWorkerThread): """ _PhEDExInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") self.dbsUrl = config.DBSInterface.globalDBSUrl self.group = getattr(config.PhEDExInjector, "group", "DataOps") # This will be used to map SE names which are stored in the DBSBuffer to # PhEDEx node names. The first key will be the "kind" which consists # of one of the following: MSS, Disk, Buffer. The next key will be the # SE name. self.seMap = {} self.nodeNames = [] self.diskSites = getattr(config.PhEDExInjector, "diskSites", ["storm-fe-cms.cr.cnaf.infn.it", "srm-cms-disk.gridpp.rl.ac.uk"]) # initialize the alert framework (if available - config.Alert present) # self.sendAlert will be then be available self.initAlerts(compName = "PhEDExInjector") self.filesToRecover = None def setup(self, parameters): """ _setup_ Create a DAO Factory for the PhEDExInjector. Also load the SE names to PhEDEx node name mappings from the data service. """ myThread = threading.currentThread() daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database", logger = self.logger, dbinterface = myThread.dbi) self.getUninjected = daofactory(classname = "GetUninjectedFiles") self.getMigrated = daofactory(classname = "GetMigratedBlocks") daofactory = DAOFactory(package = "WMComponent.DBS3Buffer", logger = self.logger, dbinterface = myThread.dbi) self.setStatus = daofactory(classname = "DBSBufferFiles.SetPhEDExStatus") self.setBlockClosed = daofactory(classname = "SetBlockClosed") nodeMappings = self.phedex.getNodeMap() for node in nodeMappings["phedex"]["node"]: if node["kind"] not in self.seMap: self.seMap[node["kind"]] = {} logging.info("Adding mapping %s -> %s" % (node["se"], node["name"])) self.seMap[node["kind"]][node["se"]] = node["name"] self.nodeNames.append(node["name"]) return def createInjectionSpec(self, injectionData): """ _createInjectionSpec_ Transform the data structure returned from the database into an XML string for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. Each dataset path will map to a list of blocks, each block being a dict. The block dicts will have three keys: name, is-open and files. The files key will be a list of dicts, each of which have the following keys: lfn, size and checksum. The following is an example object: {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} """ injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl) for datasetPath in injectionData: datasetSpec = injectionSpec.getDataset(datasetPath) for fileBlockName, fileBlock in injectionData[datasetPath].iteritems(): blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"]) for file in fileBlock["files"]: blockSpec.addFile(file["lfn"], file["checksum"], file["size"]) return injectionSpec.save() def createRecoveryFileFormat(self, unInjectedData): """ _createRecoveryFileFormat_ Transform the data structure returned from database in to the dict format for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. 
unInjectedData format {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} returns {"block1": set(["lfn1", "lfn2"])} """ sortedBlocks = defaultdict(set) for datasetPath in unInjectedData: for fileBlockName, fileBlock in unInjectedData[datasetPath].iteritems(): for fileDict in fileBlock["files"]: sortedBlocks[fileBlockName].add(fileDict["lfn"]) return sortedBlocks def injectFiles(self): """ _injectFiles_ Inject any uninjected files in PhEDEx. """ myThread = threading.currentThread() uninjectedFiles = self.getUninjected.execute() injectedFiles = [] for siteName in uninjectedFiles.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if siteName in self.diskSites: if "Disk" in self.seMap and \ siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] elif "Buffer" in self.seMap and \ siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and \ siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] else: if "Buffer" in self.seMap and \ siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and \ siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] elif "Disk" in self.seMap and \ siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] if location == None: msg = "Could not map SE %s to PhEDEx node." % siteName logging.error(msg) self.sendAlert(7, msg = msg) continue myThread.transaction.begin() xmlData = self.createInjectionSpec(uninjectedFiles[siteName]) try: injectRes = self.phedex.injectBlocks(location, xmlData) except HTTPException as ex: # If we get an HTTPException of certain types, raise it as an error if ex.status == 400: # assume it is duplicate injection error. but if that is not the case # needs to be investigated self.filesToRecover = self.createRecoveryFileFormat(uninjectedFiles[siteName]) msg = "PhEDEx injection failed with %s error: %s" % (ex.status, ex.result) raise PhEDExInjectorPassableError(msg) except Exception as ex: # If we get an error here, assume that it's temporary (it usually is) # log it, and ignore it in the algorithm() loop msg = "Encountered error while attempting to inject blocks to PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) logging.info("Injection result: %s" % injectRes) if "error" not in injectRes: for datasetName in uninjectedFiles[siteName]: for blockName in uninjectedFiles[siteName][datasetName]: for file in uninjectedFiles[siteName][datasetName][blockName]["files"]: injectedFiles.append(file["lfn"]) else: msg = ("Error injecting data %s: %s" % (uninjectedFiles[siteName], injectRes["error"])) logging.error(msg) self.sendAlert(6, msg = msg) self.setStatus.execute(injectedFiles, 1, conn = myThread.transaction.conn, transaction = myThread.transaction) injectedFiles = [] myThread.transaction.commit() return def closeBlocks(self): """ _closeBlocks_ Close any blocks that have been migrated to global DBS. 
""" myThread = threading.currentThread() migratedBlocks = self.getMigrated.execute() for siteName in migratedBlocks.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if "Buffer" in self.seMap and \ siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and \ siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] elif "Disk" in self.seMap and \ siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] if location == None: msg = "Could not map SE %s to PhEDEx node." % siteName logging.error(msg) self.sendAlert(6, msg = msg) continue myThread.transaction.begin() try: xmlData = self.createInjectionSpec(migratedBlocks[siteName]) injectRes = self.phedex.injectBlocks(location, xmlData) logging.info("Block closing result: %s" % injectRes) except HTTPException as ex: # If we get an HTTPException of certain types, raise it as an error if ex.status == 400: msg = "Received 400 HTTP Error From PhEDEx: %s" % str(ex.result) logging.error(msg) self.sendAlert(6, msg = msg) logging.debug("Blocks: %s" % migratedBlocks[siteName]) logging.debug("XMLData: %s" % xmlData) raise else: msg = "Encountered error while attempting to close blocks in PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) except Exception as ex: # If we get an error here, assume that it's temporary (it usually is) # log it, and ignore it in the algorithm() loop msg = "Encountered error while attempting to close blocks in PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) if "error" not in injectRes: for datasetName in migratedBlocks[siteName]: for blockName in migratedBlocks[siteName][datasetName]: logging.debug("Closing block %s" % blockName) self.setBlockClosed.execute(blockName, conn = myThread.transaction.conn, transaction = myThread.transaction) else: msg = ("Error injecting data %s: %s" % (migratedBlocks[siteName], injectRes["error"])) logging.error(msg) self.sendAlert(6, msg = msg) myThread.transaction.commit() return def recoverInjectedFiles(self): """ When PhEDEx inject call timed out, run this function. Since there are 3 min reponse time out in cmsweb, some times PhEDEx injection call times out even though the call succeeded In that case run the recovery mode 1. first check whether files which injection status = 0 are in the PhEDEx. 2. if those file exist set the in_phedex status to 1 3. set self.filesToRecover = None """ myThread = threading.currentThread() injectedFiles = self.phedex.getInjectedFiles(self.filesToRecover) myThread.transaction.begin() self.setStatus.execute(injectedFiles, 1) myThread.transaction.commit() # when files are recovered set the self.file self.filesToRecover = None return injectedFiles def algorithm(self, parameters): """ _algorithm_ Poll the database for uninjected files and attempt to inject them into PhEDEx. 
""" myThread = threading.currentThread() try: if self.filesToRecover != None: logging.info(""" Running PhEDExInjector Recovery: previous injection call failed, check if files were injected to PhEDEx anyway""") recoveredFiles = self.recoverInjectedFiles() logging.info("%s files already injected: changed status in dbsbuffer db" % len(recoveredFiles)) self.injectFiles() self.closeBlocks() except PhEDExInjectorPassableError as ex: logging.error("Encountered PassableError in PhEDExInjector") logging.error("Rolling back current transaction and terminating current loop, but not killing component.") if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() pass except Exception: # Guess we should roll back if we actually have an exception if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() raise return
class PhEDExInjectorPoller(BaseWorkerThread): """ _PhEDExInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") self.dbsUrl = config.DBSInterface.globalDBSUrl self.group = getattr(config.PhEDExInjector, "group", "DataOps") # This will be used to map SE names which are stored in the DBSBuffer to # PhEDEx node names. The first key will be the "kind" which consists # of one of the following: MSS, Disk, Buffer. The next key will be the # SE name. self.seMap = {} self.nodeNames = [] self.diskSites = getattr(config.PhEDExInjector, "diskSites", ["storm-fe-cms.cr.cnaf.infn.it", "srm-cms-disk.gridpp.rl.ac.uk"]) # initialize the alert framework (if available - config.Alert present) # self.sendAlert will be then be available self.initAlerts(compName = "PhEDExInjector") self.blocksToRecover = None def setup(self, parameters): """ _setup_ Create a DAO Factory for the PhEDExInjector. Also load the SE names to PhEDEx node name mappings from the data service. """ myThread = threading.currentThread() daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database", logger = self.logger, dbinterface = myThread.dbi) self.getUninjected = daofactory(classname = "GetUninjectedFiles") self.getMigrated = daofactory(classname = "GetMigratedBlocks") daofactory = DAOFactory(package = "WMComponent.DBS3Buffer", logger = self.logger, dbinterface = myThread.dbi) self.setStatus = daofactory(classname = "DBSBufferFiles.SetPhEDExStatus") self.setBlockClosed = daofactory(classname = "SetBlockClosed") nodeMappings = self.phedex.getNodeMap() for node in nodeMappings["phedex"]["node"]: if node["kind"] not in self.seMap: self.seMap[node["kind"]] = {} logging.info("Adding mapping %s -> %s" % (node["se"], node["name"])) self.seMap[node["kind"]][node["se"]] = node["name"] self.nodeNames.append(node["name"]) return def createInjectionSpec(self, injectionData): """ _createInjectionSpec_ Transform the data structure returned from the database into an XML string for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. Each dataset path will map to a list of blocks, each block being a dict. The block dicts will have three keys: name, is-open and files. The files key will be a list of dicts, each of which have the following keys: lfn, size and checksum. The following is an example object: {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} """ injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl) for datasetPath in injectionData: datasetSpec = injectionSpec.getDataset(datasetPath) for fileBlockName, fileBlock in injectionData[datasetPath].iteritems(): blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"]) for file in fileBlock["files"]: blockSpec.addFile(file["lfn"], file["checksum"], file["size"]) return injectionSpec.save() def createRecoveryFileFormat(self, unInjectedData): """ _createRecoveryFileFormat_ Transform the data structure returned from database in to the dict format for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. 
unInjectedData format {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} returns [{"block1": set(["lfn1", "lfn2"])}, {"block2": set(["lfn3", "lfn4"])] """ blocks = [] for datasetPath in unInjectedData: for blockName, fileBlock in unInjectedData[datasetPath].items(): newBlock = { blockName : set() } for fileDict in fileBlock["files"]: newBlock[blockName].add(fileDict["lfn"]) blocks.append(newBlock) return blocks def injectFiles(self): """ _injectFiles_ Inject any uninjected files in PhEDEx. """ myThread = threading.currentThread() uninjectedFiles = self.getUninjected.execute() injectedFiles = [] for siteName in uninjectedFiles.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if siteName in self.diskSites: if "Disk" in self.seMap and \ siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] elif "Buffer" in self.seMap and \ siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and \ siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] else: if "Buffer" in self.seMap and \ siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and \ siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] elif "Disk" in self.seMap and \ siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] if location == None: msg = "Could not map SE %s to PhEDEx node." % siteName logging.error(msg) self.sendAlert(7, msg = msg) continue myThread.transaction.begin() xmlData = self.createInjectionSpec(uninjectedFiles[siteName]) try: injectRes = self.phedex.injectBlocks(location, xmlData) except HTTPException as ex: # If we get an HTTPException of certain types, raise it as an error if ex.status == 400: # assume it is duplicate injection error. but if that is not the case # needs to be investigated self.blocksToRecover = self.createRecoveryFileFormat(uninjectedFiles[siteName]) msg = "PhEDEx injection failed with %s error: %s" % (ex.status, ex.result) raise PhEDExInjectorPassableError(msg) except Exception as ex: # If we get an error here, assume that it's temporary (it usually is) # log it, and ignore it in the algorithm() loop msg = "Encountered error while attempting to inject blocks to PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) logging.info("Injection result: %s" % injectRes) if "error" not in injectRes: for datasetName in uninjectedFiles[siteName]: for blockName in uninjectedFiles[siteName][datasetName]: for file in uninjectedFiles[siteName][datasetName][blockName]["files"]: injectedFiles.append(file["lfn"]) else: msg = ("Error injecting data %s: %s" % (uninjectedFiles[siteName], injectRes["error"])) logging.error(msg) self.sendAlert(6, msg = msg) self.setStatus.execute(injectedFiles, 1, conn = myThread.transaction.conn, transaction = myThread.transaction) injectedFiles = [] myThread.transaction.commit() return def closeBlocks(self): """ _closeBlocks_ Close any blocks that have been migrated to global DBS. 
""" myThread = threading.currentThread() migratedBlocks = self.getMigrated.execute() for siteName in migratedBlocks.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if "Buffer" in self.seMap and \ siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and \ siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] elif "Disk" in self.seMap and \ siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] if location == None: msg = "Could not map SE %s to PhEDEx node." % siteName logging.error(msg) self.sendAlert(6, msg = msg) continue myThread.transaction.begin() try: xmlData = self.createInjectionSpec(migratedBlocks[siteName]) injectRes = self.phedex.injectBlocks(location, xmlData) logging.info("Block closing result: %s" % injectRes) except HTTPException as ex: # If we get an HTTPException of certain types, raise it as an error if ex.status == 400: msg = "Received 400 HTTP Error From PhEDEx: %s" % str(ex.result) logging.error(msg) self.sendAlert(6, msg = msg) logging.debug("Blocks: %s" % migratedBlocks[siteName]) logging.debug("XMLData: %s" % xmlData) raise else: msg = "Encountered error while attempting to close blocks in PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) except Exception as ex: # If we get an error here, assume that it's temporary (it usually is) # log it, and ignore it in the algorithm() loop msg = "Encountered error while attempting to close blocks in PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) if "error" not in injectRes: for datasetName in migratedBlocks[siteName]: for blockName in migratedBlocks[siteName][datasetName]: logging.debug("Closing block %s" % blockName) self.setBlockClosed.execute(blockName, conn = myThread.transaction.conn, transaction = myThread.transaction) else: msg = ("Error injecting data %s: %s" % (migratedBlocks[siteName], injectRes["error"])) logging.error(msg) self.sendAlert(6, msg = msg) myThread.transaction.commit() return def recoverInjectedFiles(self): """ When PhEDEx inject call timed out, run this function. Since there are 3 min reponse time out in cmsweb, some times PhEDEx injection call times out even though the call succeeded In that case run the recovery mode 1. first check whether files which injection status = 0 are in the PhEDEx. 2. if those file exist set the in_phedex status to 1 3. set self.blocksToRecover = None Run this recovery one block at a time, with too many blocks the call to the PhEDEx data service on cmsweb can time out """ myThread = threading.currentThread() # recover one block at a time for block in self.blocksToRecover: injectedFiles = self.phedex.getInjectedFiles(block) if len(injectedFiles) > 0: myThread.transaction.begin() self.setStatus.execute(injectedFiles, 1) myThread.transaction.commit() logging.info("%s files already injected: changed status in dbsbuffer db" % len(injectedFiles)) self.blocksToRecover = None return def algorithm(self, parameters): """ _algorithm_ Poll the database for uninjected files and attempt to inject them into PhEDEx. 
""" myThread = threading.currentThread() try: if self.blocksToRecover != None: logging.info(""" Running PhEDExInjector Recovery: previous injection call failed, check if files were injected to PhEDEx anyway""") self.recoverInjectedFiles() self.injectFiles() self.closeBlocks() except PhEDExInjectorPassableError as ex: logging.error("Encountered PassableError in PhEDExInjector") logging.error("Rolling back current transaction and terminating current loop, but not killing component.") if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() pass except Exception: # Guess we should roll back if we actually have an exception if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() raise return
class AccountantWorker(WMConnectionBase): """ Class that actually does the work of parsing FWJRs for the Accountant Run through ProcessPool """ def __init__(self, config): """ __init__ Create all DAO objects that are used by this class. """ WMConnectionBase.__init__(self, "WMCore.WMBS") myThread = threading.currentThread() self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer", logger=myThread.logger, dbinterface=myThread.dbi) self.getOutputMapAction = self.daofactory( classname="Jobs.GetOutputMap") self.bulkAddToFilesetAction = self.daofactory( classname="Fileset.BulkAddByLFN") self.bulkParentageAction = self.daofactory( classname="Files.AddBulkParentage") self.getJobTypeAction = self.daofactory(classname="Jobs.GetType") self.getParentInfoAction = self.daofactory( classname="Files.GetParentInfo") self.setParentageByJob = self.daofactory( classname="Files.SetParentageByJob") self.setParentageByMergeJob = self.daofactory( classname="Files.SetParentageByMergeJob") self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi") self.setFileLocation = self.daofactory( classname="Files.SetLocationByLFN") self.setFileAddChecksum = self.daofactory( classname="Files.AddChecksumByLFN") self.addFileAction = self.daofactory(classname="Files.Add") self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput") self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk") self.getWorkflowSpec = self.daofactory( classname="Workflow.GetSpecAndNameFromTask") self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID") self.getFullJobInfo = self.daofactory( classname="Jobs.LoadForErrorHandler") self.dbsStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.SetStatus") self.dbsParentStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetParentStatus") self.dbsChildrenAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetChildren") self.dbsCreateFiles = self.dbsDaoFactory( classname="DBSBufferFiles.Add") self.dbsSetLocation = self.dbsDaoFactory( classname="DBSBufferFiles.SetLocationByLFN") self.dbsInsertLocation = self.dbsDaoFactory( classname="DBSBufferFiles.AddLocation") self.dbsSetChecksum = self.dbsDaoFactory( classname="DBSBufferFiles.AddChecksumByLFN") self.dbsSetRunLumi = self.dbsDaoFactory( classname="DBSBufferFiles.AddRunLumi") self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow") self.dbsLFNHeritage = self.dbsDaoFactory( classname="DBSBufferFiles.BulkHeritageParent") self.stateChanger = ChangeState(config) # Decide whether or not to attach jobReport to returned value self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False) # Store location for the specs for DBS self.specDir = getattr(config.JobAccountant, 'specDir', None) # ACDC service self.dataCollection = DataCollectionService( url=config.ACDC.couchurl, database=config.ACDC.database) jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url'] jobDBName = config.JobStateMachine.couchDBName jobCouchdb = CouchServer(jobDBurl) self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName) self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL) # Hold data for later commital self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} self.count = 0 self.datasetAlgoID = 
collections.deque(maxlen=1000) self.datasetAlgoPaths = collections.deque(maxlen=1000) self.dbsLocations = set() self.workflowIDs = collections.deque(maxlen=1000) self.workflowPaths = collections.deque(maxlen=1000) self.phedex = PhEDEx() self.locLists = self.phedex.getNodeMap() return def reset(self): """ _reset_ Reset all global vars between runs. """ self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} gc.collect() return def loadJobReport(self, parameters): """ _loadJobReport_ Given a framework job report on disk, load it and return a FwkJobReport instance. If there is any problem loading or parsing the framework job report return None. """ # The jobReportPath may be prefixed with "file://" which needs to be # removed so it doesn't confuse the FwkJobReport() parser. jobReportPath = parameters.get("fwjr_path", None) if not jobReportPath: logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty") jobReportPath = jobReportPath.replace("file://", "") if not os.path.exists(jobReportPath): logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath) if os.path.getsize(jobReportPath) == 0: logging.error("Empty FwkJobReport: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath) jobReport = Report() try: jobReport.load(jobReportPath) except Exception, ex: msg = "Error loading jobReport %s\n" % jobReportPath msg += str(ex) logging.error(msg) logging.debug("Failing job: %s\n" % parameters) return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport') if len(jobReport.listSteps()) == 0: logging.error("FwkJobReport with no steps: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath) return jobReport
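# -- Minimal restatement (same error codes as loadJobReport above) of the
# report-path validation ladder: strip the 'file://' prefix, then require the
# file to exist and be non-empty before any parsing is attempted.
import os

def validateReportPath(jobReportPath):
    if not jobReportPath:
        return (99999, "FWJR path is empty")
    jobReportPath = jobReportPath.replace("file://", "")
    if not os.path.exists(jobReportPath):
        return (99999, "Cannot find file in jobReport path: %s" % jobReportPath)
    if os.path.getsize(jobReportPath) == 0:
        return (99998, "jobReport of size 0: %s" % jobReportPath)
    return (0, jobReportPath)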
def testNormalModeSubscriptions(self): """ _testNormalModeSubscriptions_ Tests that we can make custodial/non-custodial subscriptions on normal operation mode, this time we don't need WMBS for anything. All is subscribed in one go. Check that the requests are correct. """ self.stuffDatabase() config = self.createConfig() phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") try: nodeMappings = phedex.getNodeMap() except Exception: time.sleep(2) try: nodeMappings = phedex.getNodeMap() except Exception: time.sleep(4) nodeMappings = phedex.getNodeMap() subscriber = PhEDExInjectorSubscriber(config, phedex, nodeMappings) subscriber.setup({}) subscriber.algorithm({}) phedexInstance = subscriber.phedex subscriptions = phedexInstance.subRequests # Let's check /BogusPrimary/Run2012Z-PromptReco-v1/RECO # According to the spec, this should be custodial at T1_US_FNAL # Non-custodial at T1_UK_RAL and T3_CO_Uniandes # Autoapproved in all sites # Priority is normal self.assertTrue(self.testDatasetA in subscriptions, "Dataset A was not subscribed") subInfoA = subscriptions[self.testDatasetA] self.assertEqual(len(subInfoA), 3, "Dataset A was not subscribed to all sites") for subInfo in subInfoA: site = subInfo["node"] self.assertEqual(subInfo["priority"], "normal", "Wrong priority for subscription") if site == "T1_UK_RAL_MSS" or site == "T3_CO_Uniandes": self.assertEqual( subInfo["custodial"], "n", "Wrong custodiality for dataset A at %s" % subInfo["node"]) self.assertEqual( subInfo["request_only"], "n", "Wrong requestOnly for dataset A at %s" % subInfo["node"]) self.assertEqual( subInfo["move"], "n", "Wrong subscription type for dataset A at %s" % subInfo["node"]) elif site == "T1_US_FNAL_MSS": self.assertEqual( subInfo["custodial"], "y", "Wrong custodiality for dataset A at %s" % subInfo["node"]) self.assertEqual( subInfo["request_only"], "n", "Wrong requestOnly for dataset A at %s" % subInfo["node"]) self.assertEqual( subInfo["move"], "y", "Wrong subscription type for dataset A at %s" % subInfo["node"]) else: self.fail("Dataset A was subscribed to a wrong site %s" % site) # Now check /BogusPrimary/CRUZET11-v1/RAW # According to the spec, this is not custodial anywhere # Non-custodial at T1_UK_RAL and T2_CH_CERN # Request only at both sites and with high priority self.assertTrue(self.testDatasetB in subscriptions, "Dataset B was not subscribed") subInfoB = subscriptions[self.testDatasetB] self.assertEqual(len(subInfoB), 2, "Dataset B was not subscribed to all sites") for subInfo in subInfoB: site = subInfo["node"] self.assertEqual(subInfo["priority"], "high", "Wrong priority for subscription") if site == "T1_UK_RAL_MSS" or site == "T2_CH_CERN": self.assertEqual( subInfo["custodial"], "n", "Wrong custodiality for dataset B at %s" % subInfo["node"]) self.assertEqual( subInfo["request_only"], "y", "Wrong requestOnly for dataset B at %s" % subInfo["node"]) self.assertEqual( subInfo["move"], "n", "Wrong subscription type for dataset B at %s" % subInfo["node"]) else: self.fail("Dataset B was subscribed to a wrong site %s" % site) myThread = threading.currentThread() result = myThread.dbi.processData( "SELECT COUNT(*) FROM dbsbuffer_dataset_subscription where subscribed = 1" )[0].fetchall() self.assertEqual(result[0][0], 5, "Not all datasets were marked as subscribed") result = myThread.dbi.processData( "SELECT site FROM dbsbuffer_dataset_subscription where subscribed = 0" )[0].fetchall() self.assertEqual(result[0][0], "T1_IT_CNAF", "A non-valid CMS site was subscribed") # Reset and run again 
and make sure that no duplicate subscriptions are created myThread.dbi.processData( "UPDATE dbsbuffer_dataset_subscription SET subscribed = 0") subscriber.algorithm({}) self.assertEqual(len(subscriptions[self.testDatasetA]), 3) self.assertEqual(len(subscriptions[self.testDatasetB]), 2) return
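# -- Compact restatement of the subscription parameters this test asserts,
# taken directly from the checks above; useful as a lookup table when adding
# new datasets to the test spec.
EXPECTED_SUBS = {
    "/BogusPrimary/Run2012Z-PromptReco-v1/RECO": {
        "T1_US_FNAL_MSS": {"custodial": "y", "request_only": "n", "move": "y", "priority": "normal"},
        "T1_UK_RAL_MSS": {"custodial": "n", "request_only": "n", "move": "n", "priority": "normal"},
        "T3_CO_Uniandes": {"custodial": "n", "request_only": "n", "move": "n", "priority": "normal"},
    },
    "/BogusPrimary/CRUZET11-v1/RAW": {
        "T1_UK_RAL_MSS": {"custodial": "n", "request_only": "y", "move": "n", "priority": "high"},
        "T2_CH_CERN": {"custodial": "n", "request_only": "y", "move": "n", "priority": "high"},
    },
}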
class Getter(object): """ Get transfers to be submitted """ def __init__(self, config, quiet, debug, test=False): """ initialize log, connections etc """ self.config = config.Getter self.TEST = False createLogdir('Monitor') def setRootLogger(quiet, debug): """ Taken from CRABServer TaskWorker Sets the root logger with the desired verbosity level The root logger logs to logs/asolog.txt and every single logging instruction is propagated to it (not really nice to read) :arg bool quiet: it tells if a quiet logger is needed :arg bool debug: it tells if needs a verbose logger :return logger: a logger with the appropriate logger level.""" createLogdir('logs') createLogdir('logs/processes') if self.TEST: # if we are testing log to the console is easier logging.getLogger().addHandler(logging.StreamHandler()) else: logHandler = MultiProcessingLog('logs/submitter.txt', when='midnight') logFormatter = \ logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s") logHandler.setFormatter(logFormatter) logging.getLogger().addHandler(logHandler) loglevel = logging.INFO if quiet: loglevel = logging.WARNING if debug: loglevel = logging.DEBUG logging.getLogger().setLevel(loglevel) logger = setProcessLogger("master") logger.debug("PID %s.", os.getpid()) logger.debug("Logging level initialized to %s.", loglevel) return logger try: self.phedex = PhEDEx(responseType='xml', dict={ 'key': self.config.opsProxy, 'cert': self.config.opsProxy }) except Exception as e: self.logger.exception('PhEDEx exception: %s' % e) self.documents = dict() self.doc_acq = '' self.STOP = False self.logger = setRootLogger(quiet, debug) self.q = Queue() self.active_lfns = list() self.Update = update(self.logger, self.config) self.site_tfc_map = {} for site in [ x['name'] for x in json.loads(self.phedex.getNodeMap())['phedex']['node'] ]: if site and str(site) != 'None' and str(site) != 'unknown': self.site_tfc_map[site] = self.get_tfc_rules(site) self.logger.debug('tfc site: %s %s' % (site, self.get_tfc_rules(site))) def algorithm(self): """ - Get Users - Get Source dest - create queue for each (user, link) - feed threads """ workers = list() for i in range(self.config.max_threads_num): worker = Thread(target=self.worker, args=(i, self.q)) worker.setDaemon(True) worker.start() workers.append(worker) site_tfc_map = dict() while not self.STOP: sites, users = self.oracleSiteUser(self.Update) self.Update.retry() for _user in users: for source in sites: for dest in sites: lfns = [[x['source_lfn'], x['destination_lfn']] for x in self.documents if x['source'] == source and x['destination'] == dest and x['username'] == _user[0] and x not in self.active_lfns] self.active_lfns = self.active_lfns + lfns # IMPORTANT: remove only on final states for files in chunks(lfns, self.config.files_per_job): self.q.put((files, _user, source, dest, self.site_tfc_map)) self.logger.debug('Queue lenght: %s' % self.q.qsize()) time.sleep(4) for w in workers: w.join() self.logger.info('Submitter stopped.') def oracleSiteUser(self, Update): """ 1. Acquire transfers from DB 2. 
Get acquired users and destination sites """ # TODO: flexible with other DBs and get users list users = Update.acquire() if users != 1: self.documents = Update.getAcquired(users) for doc in self.documents: if doc['user_role'] is None: doc['user_role'] = "" if doc['user_group'] is None: doc['user_group'] = "" unique_users = list() try: unique_users = [ list(i) for i in set( tuple([x['username'], x['user_group'], x['user_role']]) for x in self.documents) ] except Exception as ex: self.logger.error("Failed to map active users: %s" % ex) if len(unique_users) <= self.config.pool_size: active_users = unique_users else: active_users = unique_users[:self.config.pool_size] self.logger.info('%s active users' % len(active_users)) self.logger.debug('Active users are: %s' % active_users) active_sites_dest = [x['destination'] for x in self.documents] active_sites = active_sites_dest + [ x['source'] for x in self.documents ] self.logger.debug('Active sites are: %s' % list(set(active_sites))) return list(set(active_sites)), active_users def get_tfc_rules(self, site): """ Get the TFC regexp for a given site. """ tfc_file = None try: self.phedex.getNodeTFC(site) except Exception as e: self.logger.exception('PhEDEx exception: %s' % e) try: tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site}) except Exception as e: self.logger.exception('PhEDEx cache exception: %s' % e) return readTFC(tfc_file) def critical_failure(self, lfns, lock, inputs): """ if an exception occurs before the end, remove lfns from active to let it be reprocessed later. :param lfns: :param lock: :param inputs: :return: """ lock.acquire() for lfn in lfns: self.active_lfns.remove(lfn) lock.release() inputs.task_done() def worker(self, i, inputs): """ - Retrieve userDN - Retrieve user proxy - Delegate proxy to fts is needed - submit fts job - update doc states :param i: thread number :param inputs: tuple (lfns, _user, source, dest, tfc_map) :return: """ # TODO: differentiate log messages per USER! logger = self.logger logger.info("Process %s is starting. PID %s", i, os.getpid()) lock = Lock() Update = update(logger, self.config) while not self.STOP: if inputs.empty(): time.sleep(10) continue try: lfns, _user, source, dest, tfc_map = inputs.get() [user, group, role] = _user except (EOFError, IOError): crashMessage = "Hit EOF/IO in getting new work\n" crashMessage += "Assuming this is a graceful break attempt.\n" logger.error(crashMessage) continue start = time.time() if not self.config.TEST: try: userDN = getDNFromUserName(user, logger, ckey=self.config.opsProxy, cert=self.config.opsProxy) except Exception as ex: logger.exception('Cannot retrieve user DN') self.critical_failure(lfns, lock, inputs) continue defaultDelegation = { 'logger': logger, 'credServerPath': self.config.credentialDir, 'myProxySvr': 'myproxy.cern.ch', 'min_time_left': getattr(self.config, 'minTimeLeft', 36000), 'serverDN': self.config.serverDN, 'uisource': '', 'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False) } cache_area = self.config.cache_area try: defaultDelegation['myproxyAccount'] = re.compile( 'https?://([^/]*)/.*').findall(cache_area)[0] except IndexError: logger.error( 'MyproxyAccount parameter cannot be retrieved from %s . 
' % self.config.cache_area) self.critical_failure(lfns, lock, inputs) continue if getattr(self.config, 'serviceCert', None): defaultDelegation['server_cert'] = self.config.serviceCert if getattr(self.config, 'serviceKey', None): defaultDelegation['server_key'] = self.config.serviceKey try: defaultDelegation['userDN'] = userDN defaultDelegation['group'] = group defaultDelegation['role'] = role logger.debug('delegation: %s' % defaultDelegation) valid_proxy, user_proxy = getProxy(defaultDelegation, logger) if not valid_proxy: logger.error( 'Failed to retrieve user proxy... putting docs on retry' ) logger.error( 'docs on retry: %s' % Update.failed(lfns, submission_error=True)) continue except Exception: logger.exception('Error retrieving proxy') self.critical_failure(lfns, lock, inputs) continue else: user_proxy = self.config.opsProxy self.logger.debug("Using opsProxy for testmode") context = dict() try: if self.config.TEST: logger.debug("Running in test mode, submitting fake jobs") else: context = fts3.Context(self.config.serverFTS, user_proxy, user_proxy, verify=True) logger.debug( fts3.delegate(context, lifetime=timedelta(hours=48), force=False)) except Exception: logger.exception("Error submitting to FTS") self.critical_failure(lfns, lock, inputs) continue failed_lfn = list() try: if self.config.TEST: submitted_lfn = lfns jobid = getHashLfn(lfns[0][0]) self.logger.debug('Fake job id: ' + jobid) else: failed_lfn, submitted_lfn, jobid = Submission( lfns, source, dest, i, self.logger, fts3, context, tfc_map) if jobid == -1: self.critical_failure(lfns, lock, inputs) continue logger.info('Submitted %s files' % len(submitted_lfn)) except Exception: logger.exception("Unexpected error during FTS job submission!") self.critical_failure(lfns, lock, inputs) continue # TODO: add file FTS id and job id columns for kill command try: Update.submitted(lfns) except Exception: logger.exception("Error updating document status") self.critical_failure(lfns, lock, inputs) continue try: Update.failed(failed_lfn) except Exception: logger.exception( "Error updating document status, job submission will be retried later..." ) self.critical_failure(lfns, lock, inputs) continue try: createLogdir('Monitor/' + user) with open('Monitor/' + user + '/' + str(jobid) + '.txt', 'w') as outfile: json.dump(lfns, outfile) logger.info('Monitor files created') except Exception: logger.exception("Error creating file for monitor") self.critical_failure(lfns, lock, inputs) continue end = time.time() self.logger.info('Input processed in %s', str(end - start)) time.sleep(0.5) logger.debug("Worker %s exiting.", i) return 0 def quit_(self): """ set STOP to True :return: """ self.logger.info( "Received kill request. Setting STOP flag in the master and threads..." ) self.STOP = True
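# -- Sketch of the 'chunks' helper that algorithm() above relies on but which
# is not defined in this snippet (its exact implementation is an assumption):
# split the LFN list into fixed-size batches, one FTS submission per batch.

def chunks(seq, size):
    """Yield successive size-length slices of seq."""
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# e.g. with files_per_job = 2: list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]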
class PhEDExInjectorPoller(BaseWorkerThread): """ _PhEDExInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") self.dbsUrl = config.DBSInterface.globalDBSUrl self.group = getattr(config.PhEDExInjector, "group", "DataOps") # This will be used to map SE names which are stored in the DBSBuffer to # PhEDEx node names. The first key will be the "kind" which consists # of one of the following: MSS, Disk, Buffer. The next key will be the # SE name. self.seMap = {} self.nodeNames = [] self.diskSites = getattr(config.PhEDExInjector, "diskSites", ["storm-fe-cms.cr.cnaf.infn.it", "srm-cms-disk.gridpp.rl.ac.uk"]) # initialize the alert framework (if available - config.Alert present) # self.sendAlert will be then be available self.initAlerts(compName = "PhEDExInjector") self.filesToRecover = None def setup(self, parameters): """ _setup_ Create a DAO Factory for the PhEDExInjector. Also load the SE names to PhEDEx node name mappings from the data service. """ myThread = threading.currentThread() daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database", logger = self.logger, dbinterface = myThread.dbi) self.getUninjected = daofactory(classname = "GetUninjectedFiles") self.getMigrated = daofactory(classname = "GetMigratedBlocks") daofactory = DAOFactory(package = "WMComponent.DBS3Buffer", logger = self.logger, dbinterface = myThread.dbi) self.setStatus = daofactory(classname = "DBSBufferFiles.SetPhEDExStatus") daofactory = DAOFactory(package = "WMComponent.DBSUpload.Database", logger = self.logger, dbinterface = myThread.dbi) self.setBlockStatus = daofactory(classname = "SetBlockStatus") nodeMappings = self.phedex.getNodeMap() for node in nodeMappings["phedex"]["node"]: if not self.seMap.has_key(node["kind"]): self.seMap[node["kind"]] = {} logging.info("Adding mapping %s -> %s" % (node["se"], node["name"])) self.seMap[node["kind"]][node["se"]] = node["name"] self.nodeNames.append(node["name"]) return def createInjectionSpec(self, injectionData): """ _createInjectionSpec_ Transform the data structure returned from the database into an XML string for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. Each dataset path will map to a list of blocks, each block being a dict. The block dicts will have three keys: name, is-open and files. The files key will be a list of dicts, each of which have the following keys: lfn, size and checksum. The following is an example object: {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} """ injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl) for datasetPath in injectionData: datasetSpec = injectionSpec.getDataset(datasetPath) for fileBlockName, fileBlock in injectionData[datasetPath].iteritems(): blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"]) for file in fileBlock["files"]: blockSpec.addFile(file["lfn"], file["checksum"], file["size"]) return injectionSpec.save() def createRecoveryFileFormat(self, unInjectedData): """ _createRecoveryFileFormat_ Transform the data structure returned from database in to the dict format for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. 
unInjectedData format {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} returns {"block1": set(["lfn1", "lfn2"])} """ sortedBlocks = defaultdict(set) for datasetPath in unInjectedData: for fileBlockName, fileBlock in unInjectedData[datasetPath].iteritems(): for fileDict in fileBlock["files"]: sortedBlocks[fileBlockName].add(fileDict["lfn"]) return sortedBlocks def injectFiles(self): """ _injectFiles_ Inject any uninjected files in PhEDEx. """ myThread = threading.currentThread() uninjectedFiles = self.getUninjected.execute() injectedFiles = [] for siteName in uninjectedFiles.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if siteName in self.diskSites: if self.seMap.has_key("Disk") and \ self.seMap["Disk"].has_key(siteName): location = self.seMap["Disk"][siteName] elif self.seMap.has_key("Buffer") and \ self.seMap["Buffer"].has_key(siteName): location = self.seMap["Buffer"][siteName] elif self.seMap.has_key("MSS") and \ self.seMap["MSS"].has_key(siteName): location = self.seMap["MSS"][siteName] else: if self.seMap.has_key("Buffer") and \ self.seMap["Buffer"].has_key(siteName): location = self.seMap["Buffer"][siteName] elif self.seMap.has_key("MSS") and \ self.seMap["MSS"].has_key(siteName): location = self.seMap["MSS"][siteName] elif self.seMap.has_key("Disk") and \ self.seMap["Disk"].has_key(siteName): location = self.seMap["Disk"][siteName] if location == None: msg = "Could not map SE %s to PhEDEx node." % siteName logging.error(msg) self.sendAlert(7, msg = msg) continue myThread.transaction.begin() xmlData = self.createInjectionSpec(uninjectedFiles[siteName]) try: injectRes = self.phedex.injectBlocks(location, xmlData) except HTTPException, ex: # If we get an HTTPException of certain types, raise it as an error if ex.status == 400: # assume it is duplicate injection error. but if that is not the case # needs to be investigated self.filesToRecover = self.createRecoveryFileFormat(uninjectedFiles[siteName]) msg = "PhEDEx injection failed with %s error: %s" % (ex.status, ex.result) raise PhEDExInjectorPassableError(msg) except Exception, ex: # If we get an error here, assume that it's temporary (it usually is) # log it, and ignore it in the algorithm() loop msg = "Encountered error while attempting to inject blocks to PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) logging.info("Injection result: %s" % injectRes) if not injectRes.has_key("error"): for datasetName in uninjectedFiles[siteName]: for blockName in uninjectedFiles[siteName][datasetName]: for file in uninjectedFiles[siteName][datasetName][blockName]["files"]: injectedFiles.append(file["lfn"]) else: msg = ("Error injecting data %s: %s" % (uninjectedFiles[siteName], injectRes["error"])) logging.error(msg) self.sendAlert(6, msg = msg) self.setStatus.execute(injectedFiles, 1, conn = myThread.transaction.conn, transaction = myThread.transaction) injectedFiles = [] myThread.transaction.commit()
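# -- Worked example (shape copied from the createInjectionSpec docstring above)
# of the injectionData structure the poller feeds to XMLDrop before calling
# injectBlocks; dataset/block/LFN names are illustrative only.
injectionData = {
    "/Prim/Proc-v1/RAW": {
        "/Prim/Proc-v1/RAW#block1": {
            "is-open": "y",
            "files": [
                {"lfn": "/store/data/lfn1", "size": 10, "checksum": {"cksum": "1234"}},
                {"lfn": "/store/data/lfn2", "size": 20, "checksum": {"cksum": "4321"}},
            ],
        }
    }
}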
class PhEDExInjectorPoller(BaseWorkerThread): """ _PhEDExInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.dbsUrl = config.DBSInterface.globalDBSUrl self.pollCounter = 0 self.subFrequency = None if getattr(config.PhEDExInjector, "subscribeDatasets", False): pollInterval = config.PhEDExInjector.pollInterval subInterval = config.PhEDExInjector.subscribeInterval self.subFrequency = max(1, int(round(subInterval / float(pollInterval)))) logging.info("SubscribeDataset and deleteBlocks will run every %d polling cycles", self.subFrequency) # subscribe on first cycle self.pollCounter = self.subFrequency - 1 # retrieving the node mappings is fickle and can fail quite often self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") try: nodeMappings = self.phedex.getNodeMap() except Exception: time.sleep(2) try: nodeMappings = self.phedex.getNodeMap() except Exception: time.sleep(4) nodeMappings = self.phedex.getNodeMap() # This will be used to map SE names which are stored in the DBSBuffer to # PhEDEx node names. The first key will be the "kind" which consists # of one of the following: MSS, Disk, Buffer. The next key will be the # SE name. self.seMap = {} self.nodeNames = [] for node in nodeMappings["phedex"]["node"]: if node["kind"] not in self.seMap: self.seMap[node["kind"]] = {} logging.info("Adding mapping %s -> %s", node["se"], node["name"]) self.seMap[node["kind"]][node["se"]] = node["name"] self.nodeNames.append(node["name"]) self.phedexNodes = {'MSS': [], 'Disk': []} for node in nodeMappings["phedex"]["node"]: if node["kind"] in ["MSS", "Disk"]: self.phedexNodes[node["kind"]].append(node["name"]) # initialize the alert framework (if available - config.Alert present) # self.sendAlert will then be available self.initAlerts(compName = "PhEDExInjector") self.blocksToRecover = [] return def setup(self, parameters): """ _setup_ Create a DAO Factory and set up the DAOs. """ myThread = threading.currentThread() daofactory = DAOFactory(package = "WMComponent.PhEDExInjector.Database", logger = self.logger, dbinterface = myThread.dbi) self.getUninjected = daofactory(classname = "GetUninjectedFiles") self.getMigrated = daofactory(classname = "GetMigratedBlocks") self.findDeletableBlocks = daofactory(classname = "GetDeletableBlocks") self.markBlocksDeleted = daofactory(classname = "MarkBlocksDeleted") self.getUnsubscribed = daofactory(classname = "GetUnsubscribedDatasets") self.markSubscribed = daofactory(classname = "MarkDatasetSubscribed") daofactory = DAOFactory(package = "WMComponent.DBS3Buffer", logger = self.logger, dbinterface = myThread.dbi) self.setStatus = daofactory(classname = "DBSBufferFiles.SetPhEDExStatus") self.setBlockClosed = daofactory(classname = "SetBlockClosed") return def algorithm(self, parameters): """ _algorithm_ Poll the database for uninjected files and attempt to inject them into PhEDEx.
""" logging.info("Running PhEDEx injector poller algorithm...") self.pollCounter += 1 if self.blocksToRecover: logging.info("""PhEDExInjector Recovery: previous injection call failed, check if files were injected to PhEDEx anyway""") self.recoverInjectedFiles() self.injectFiles() self.closeBlocks() if self.pollCounter == self.subFrequency: self.pollCounter = 0 self.deleteBlocks() self.subscribeDatasets() return def createInjectionSpec(self, injectionData): """ _createInjectionSpec_ Transform the data structure returned from the database into an XML string for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. Each dataset path will map to a list of blocks, each block being a dict. The block dicts will have three keys: name, is-open and files. The files key will be a list of dicts, each of which have the following keys: lfn, size and checksum. The following is an example object: {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} """ injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl) for datasetPath in injectionData: datasetSpec = injectionSpec.getDataset(datasetPath) for fileBlockName, fileBlock in injectionData[datasetPath].iteritems(): blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"]) for f in fileBlock["files"]: blockSpec.addFile(f["lfn"], f["checksum"], f["size"]) return injectionSpec.save() def createRecoveryFileFormat(self, unInjectedData): """ _createRecoveryFileFormat_ Transform the data structure returned from database in to the dict format for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. unInjectedData format {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} returns [{"block1": set(["lfn1", "lfn2"])}, {"block2": set(["lfn3", "lfn4"])] """ blocks = [] for datasetPath in unInjectedData: for blockName, fileBlock in unInjectedData[datasetPath].items(): newBlock = { blockName: set() } for fileDict in fileBlock["files"]: newBlock[blockName].add(fileDict["lfn"]) blocks.append(newBlock) return blocks def injectFiles(self): """ _injectFiles_ Inject any uninjected files in PhEDEx. """ logging.info("Starting injectFiles method") uninjectedFiles = self.getUninjected.execute() for siteName in uninjectedFiles.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] elif "Disk" in self.seMap and siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] if location == None: msg = "Could not map SE %s to PhEDEx node." 
% siteName logging.error(msg) self.sendAlert(7, msg = msg) continue maxDataset = 20 maxBlocks = 50 maxFiles = 5000 numberDatasets = 0 numberBlocks = 0 numberFiles = 0 injectData = {} lfnList = [] for dataset in uninjectedFiles[siteName]: numberDatasets += 1 injectData[dataset] = uninjectedFiles[siteName][dataset] for block in injectData[dataset]: numberBlocks += 1 numberFiles += len(injectData[dataset][block]['files']) for fileInfo in injectData[dataset][block]['files']: lfnList.append(fileInfo['lfn']) if numberDatasets >= maxDataset or numberBlocks >= maxBlocks or numberFiles >= maxFiles: self.injectFilesPhEDExCall(location, injectData, lfnList) numberDatasets = 0 numberBlocks = 0 numberFiles = 0 injectData = {} lfnList = [] if injectData: self.injectFilesPhEDExCall(location, injectData, lfnList) return def injectFilesPhEDExCall(self, location, injectData, lfnList): """ _injectFilesPhEDExCall_ actual PhEDEx call for file injection """ xmlData = self.createInjectionSpec(injectData) logging.debug("injectFiles XMLData: %s", xmlData) try: injectRes = self.phedex.injectBlocks(location, xmlData) except HTTPException as ex: # an HTTPException with status 400 is assumed to be a duplicate injection # error; trigger the later block recovery (investigation needed if that is not the case) if ex.status == 400: self.blocksToRecover.extend( self.createRecoveryFileFormat(injectData) ) logging.error("PhEDEx file injection failed with HTTPException: %s %s", ex.status, ex.result) except Exception as ex: logging.error("PhEDEx file injection failed with Exception: %s", str(ex)) logging.debug("Traceback: %s", str(traceback.format_exc())) else: logging.info("Injection result: %s", injectRes) if "error" in injectRes: msg = "Error injecting data %s: %s" % (injectData, injectRes["error"]) logging.error(msg) self.sendAlert(6, msg = msg) else: try: self.setStatus.execute(lfnList, 1) except Exception: # possible deadlock with DBS3Upload, retry once after 5s logging.warning("Oracle exception during file status update, possible deadlock due to race condition, retry after 5s sleep") time.sleep(5) self.setStatus.execute(lfnList, 1) return def closeBlocks(self): """ _closeBlocks_ Close any blocks that have been migrated to global DBS """ logging.info("Starting closeBlocks method") migratedBlocks = self.getMigrated.execute() for siteName in migratedBlocks.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here. location = None if siteName in self.nodeNames: location = siteName else: if "Buffer" in self.seMap and siteName in self.seMap["Buffer"]: location = self.seMap["Buffer"][siteName] elif "MSS" in self.seMap and siteName in self.seMap["MSS"]: location = self.seMap["MSS"][siteName] elif "Disk" in self.seMap and siteName in self.seMap["Disk"]: location = self.seMap["Disk"][siteName] if location is None: msg = "Could not map SE %s to PhEDEx node."
% siteName logging.error(msg) self.sendAlert(6, msg = msg) continue xmlData = self.createInjectionSpec(migratedBlocks[siteName]) logging.debug("closeBlocks XMLData: %s", xmlData) try: injectRes = self.phedex.injectBlocks(location, xmlData) except HTTPException as ex: logging.error("PhEDEx block close failed with HTTPException: %s %s", ex.status, ex.result) except Exception as ex: logging.error("PhEDEx block close failed with Exception: %s", str(ex)) logging.debug("Traceback: %s", str(traceback.format_exc())) else: logging.info("Block closing result: %s", injectRes) if "error" not in injectRes: for datasetName in migratedBlocks[siteName]: for blockName in migratedBlocks[siteName][datasetName]: logging.debug("Closing block %s", blockName) self.setBlockClosed.execute(blockName) else: msg = "Error injecting data %s: %s" % (migratedBlocks[siteName], injectRes["error"]) logging.error(msg) self.sendAlert(6, msg = msg) return def recoverInjectedFiles(self): """ _recoverInjectedFiles_ Run this method when a PhEDEx injection call has timed out. Since cmsweb enforces a 3 minute response timeout, a PhEDEx injection call can sometimes time out even though the injection itself succeeded. In that case run the recovery mode: 1. First check whether the files whose injection status = 0 are already in PhEDEx. 2. If those files exist, set their in_phedex status to 1. 3. Set self.blocksToRecover = [] Run this recovery one block at a time; with too many blocks the call to the PhEDEx data service on cmsweb can time out. """ # recover one block at a time for block in self.blocksToRecover: injectedFiles = self.phedex.getInjectedFiles(block) if injectedFiles: self.setStatus.execute(injectedFiles, 1) self.blocksToRecover = [] return def deleteBlocks(self): """ _deleteBlocks_ Find deletable blocks, then decide whether to delete based on: Is there an active subscription for the dataset or block? If yes => set deleted=2 If no => next check Has the transfer to all destinations finished?
If yes => request block deletion, approve request, set deleted=1 If no => do nothing (check again next cycle) """ logging.info("Starting deleteBlocks method") blockDict = self.findDeletableBlocks.execute(transaction = False) if not blockDict: return try: subscriptions = self.phedex.getSubscriptionMapping(*blockDict.keys()) except Exception: logging.error("Couldn't get subscription info from PhEDEx, retry next cycle") return skippableBlocks = [] deletableEntries = {} for blockName in blockDict: location = blockDict[blockName]['location'] # should never be triggered, better safe than sorry if location.endswith('_MSS'): logging.debug("Location %s for block %s is MSS, skip deletion", location, blockName) skippableBlocks.append(blockName) continue dataset = blockDict[blockName]['dataset'] sites = blockDict[blockName]['sites'] if blockName in subscriptions and location in subscriptions[blockName]: logging.debug("Block %s subscribed to %s, skip deletion", blockName, location) binds = { 'DELETED': 2, 'BLOCKNAME': blockName } self.markBlocksDeleted.execute(binds) else: blockInfo = [] try: blockInfo = self.phedex.getReplicaInfoForBlocks(block = blockName, complete = 'y')['phedex']['block'] except Exception: logging.error("Couldn't get block info from PhEDEx, retry next cycle") else: for entry in blockInfo: if entry['name'] == blockName: nodes = set([x['node'] for x in entry['replica']]) if location not in nodes: logging.debug("Block %s not present on %s, mark as deleted", blockName, location) binds = { 'DELETED': 1, 'BLOCKNAME': blockName } self.markBlocksDeleted.execute(binds) elif sites.issubset(nodes): logging.debug("Deleting block %s from %s since it is fully transferred", blockName, location) if location not in deletableEntries: deletableEntries[location] = {} if dataset not in deletableEntries[location]: deletableEntries[location][dataset] = set() deletableEntries[location][dataset].add(blockName) binds = [] for blockName in skippableBlocks: binds.append( { 'DELETED': 2, 'BLOCKNAME': blockName } ) if binds: self.markBlocksDeleted.execute(binds) for location in deletableEntries: chunkSize = 100 numberOfBlocks = 0 blocksToDelete = {} for dataset in deletableEntries[location]: blocksToDelete[dataset] = deletableEntries[location][dataset] numberOfBlocks += len(blocksToDelete[dataset]) if numberOfBlocks > chunkSize: self.deleteBlocksPhEDExCalls(location, blocksToDelete) numberOfBlocks = 0 blocksToDelete = {} if blocksToDelete: self.deleteBlocksPhEDExCalls(location, blocksToDelete) return def deleteBlocksPhEDExCalls(self, location, blocksToDelete): """ _deleteBlocksPhEDExCalls_ actual PhEDEx calls for block deletion """ deletion = PhEDExDeletion(blocksToDelete.keys(), location, level = 'block', comments = "WMAgent blocks auto-delete from %s" % location, blocks = blocksToDelete) xmlData = XMLDrop.makePhEDExXMLForBlocks(self.dbsUrl, deletion.getDatasetsAndBlocks()) logging.debug("deleteBlocks XMLData: %s", xmlData) try: response = self.phedex.delete(deletion, xmlData) requestId = response['phedex']['request_created'][0]['id'] # auto-approve deletion request self.phedex.updateRequest(requestId, 'approve', location) except HTTPException as ex: logging.error("PhEDEx block delete/approval failed with HTTPException: %s %s", ex.status, ex.result) except Exception as ex: logging.error("PhEDEx block delete/approval failed with Exception: %s", str(ex)) logging.debug("Traceback: %s", str(traceback.format_exc())) else: binds = [] for dataset in blocksToDelete: for blockName in blocksToDelete[dataset]: binds.append( { 'DELETED': 1, 'BLOCKNAME':
blockName } ) self.markBlocksDeleted.execute(binds) return def subscribeDatasets(self): """ _subscribeDatasets_ Poll the database for datasets and subscribe them. """ logging.info("Starting subscribeDatasets method") # Check for completely unsubscribed datasets unsubscribedDatasets = self.getUnsubscribed.execute() # Keep a list of subscriptions to tick as subscribed in the database subscriptionsMade = [] # Create a list of subscriptions as defined by the PhEDEx data structures subs = SubscriptionList() # Create the subscription objects and add them to the list # The list takes care of the sorting internally for subInfo in unsubscribedDatasets: site = subInfo['site'] if site not in self.phedexNodes['MSS'] and site not in self.phedexNodes['Disk']: msg = "Site %s doesn't appear to be valid to PhEDEx, " % site msg += "skipping subscription: %s" % subInfo['id'] logging.error(msg) self.sendAlert(7, msg = msg) continue # Avoid custodial subscriptions to disk nodes if site not in self.phedexNodes['MSS']: subInfo['custodial'] = 'n' # Avoid auto approval in T1 sites elif site.startswith("T1"): subInfo['request_only'] = 'y' phedexSub = PhEDExSubscription(subInfo['path'], site, subInfo['phedex_group'], priority = subInfo['priority'], move = subInfo['move'], custodial = subInfo['custodial'], request_only = subInfo['request_only'], subscriptionId = subInfo['id']) # Check if the subscription is a duplicate if phedexSub.matchesExistingSubscription(self.phedex) or \ phedexSub.matchesExistingTransferRequest(self.phedex): subscriptionsMade.append(subInfo['id']) continue # Add it to the list subs.addSubscription(phedexSub) # Compact the subscriptions subs.compact() for subscription in subs.getSubscriptionList(): xmlData = XMLDrop.makePhEDExXMLForDatasets(self.dbsUrl, subscription.getDatasetPaths()) logging.debug("subscribeDatasets XMLData: %s" , xmlData) logging.info("Subscribing: %s to %s, with options: Move: %s, Custodial: %s, Request Only: %s", subscription.getDatasetPaths(), subscription.getNodes(), subscription.move, subscription.custodial, subscription.request_only) try: self.phedex.subscribe(subscription, xmlData) except HTTPException as ex: logging.error("PhEDEx dataset subscribe failed with HTTPException: %s %s", ex.status, ex.result) except Exception as ex: logging.error("PhEDEx dataset subscribe failed with Exception: %s", str(ex)) logging.debug("Traceback: %s", str(traceback.format_exc())) else: subscriptionsMade.extend(subscription.getSubscriptionIds()) # Register the result in DBSBuffer if subscriptionsMade: self.markSubscribed.execute(subscriptionsMade) return
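# The subscribe/delete work above piggybacks on the injection polling cycle:
# subFrequency is derived from the two configured intervals and pollCounter is
# pre-loaded so that the very first cycle already runs subscribeDatasets and
# deleteBlocks. A minimal sketch of that cadence (the interval values used in
# the example are illustrative, not taken from any real configuration):

def subscribeCycles(pollInterval, subInterval, totalCycles):
    """Return the 1-based polling cycles on which subscribe/delete would run."""
    subFrequency = max(1, int(round(subInterval / float(pollInterval))))
    pollCounter = subFrequency - 1  # subscribe on the first cycle
    cycles = []
    for cycle in range(1, totalCycles + 1):
        pollCounter += 1
        if pollCounter == subFrequency:
            pollCounter = 0
            cycles.append(cycle)
    return cycles

# With a 100s poll interval and a 300s subscribe interval, subFrequency is 3,
# so cycles 1, 4 and 7 run the subscribe/delete pass:
assert subscribeCycles(100, 300, 8) == [1, 4, 7]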
class AccountantWorker(WMConnectionBase): """ Class that actually does the work of parsing FWJRs for the Accountant Run through ProcessPool """ def __init__(self, config): """ __init__ Create all DAO objects that are used by this class. """ WMConnectionBase.__init__(self, "WMCore.WMBS") myThread = threading.currentThread() self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer", logger = myThread.logger, dbinterface = myThread.dbi) self.getOutputMapAction = self.daofactory(classname = "Jobs.GetOutputMap") self.bulkAddToFilesetAction = self.daofactory(classname = "Fileset.BulkAddByLFN") self.bulkParentageAction = self.daofactory(classname = "Files.AddBulkParentage") self.getJobTypeAction = self.daofactory(classname = "Jobs.GetType") self.getParentInfoAction = self.daofactory(classname = "Files.GetParentInfo") self.setParentageByJob = self.daofactory(classname = "Files.SetParentageByJob") self.setFileRunLumi = self.daofactory(classname = "Files.AddRunLumi") self.setFileLocation = self.daofactory(classname = "Files.SetLocationByLFN") self.setFileAddChecksum = self.daofactory(classname = "Files.AddChecksumByLFN") self.addFileAction = self.daofactory(classname = "Files.Add") self.jobCompleteInput = self.daofactory(classname = "Jobs.CompleteInput") self.setBulkOutcome = self.daofactory(classname = "Jobs.SetOutcomeBulk") self.getWorkflowSpec = self.daofactory(classname = "Workflow.GetSpecAndNameFromTask") self.dbsStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.SetStatus") self.dbsParentStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetParentStatus") self.dbsChildrenAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetChildren") self.dbsCreateFiles = self.dbsDaoFactory(classname = "DBSBufferFiles.Add") self.dbsSetLocation = self.dbsDaoFactory(classname = "DBSBufferFiles.SetLocationByLFN") self.dbsInsertLocation = self.dbsDaoFactory(classname = "DBSBufferFiles.AddLocation") self.dbsSetChecksum = self.dbsDaoFactory(classname = "DBSBufferFiles.AddChecksumByLFN") self.dbsSetRunLumi = self.dbsDaoFactory(classname = "DBSBufferFiles.AddRunLumi") self.insertWorkflow = self.dbsDaoFactory(classname = "InsertWorkflow") self.dbsNewAlgoAction = self.dbsDaoFactory(classname = "NewAlgo") self.dbsNewDatasetAction = self.dbsDaoFactory(classname = "NewDataset") self.dbsAssocAction = self.dbsDaoFactory(classname = "AlgoDatasetAssoc") self.dbsExistsAction = self.dbsDaoFactory(classname = "DBSBufferFiles.ExistsForAccountant") self.dbsLFNHeritage = self.dbsDaoFactory(classname = "DBSBufferFiles.BulkHeritageParent") self.dbsSetDatasetAlgoAction = self.dbsDaoFactory(classname = "SetDatasetAlgo") self.stateChanger = ChangeState(config) # Decide whether or not to attach the jobReport to the returned value self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False) # Store the location of the specs for DBS self.specDir = getattr(config.JobAccountant, 'specDir', None) # Hold data for a later commit self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.count = 0 self.datasetAlgoID = collections.deque(maxlen = 1000) self.datasetAlgoPaths = collections.deque(maxlen = 1000) self.dbsLocations = collections.deque(maxlen = 1000) self.workflowIDs = collections.deque(maxlen = 1000) self.workflowPaths = collections.deque(maxlen = 1000) self.phedex = PhEDEx() self.locLists = self.phedex.getNodeMap() return def reset(self): """ _reset_ Reset
all global vars between runs. """ self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] gc.collect() return def loadJobReport(self, parameters): """ _loadJobReport_ Given a framework job report on disk, load it and return a FwkJobReport instance. If there is any problem loading or parsing the framework job report return None. """ # The jobReportPath may be prefixed with "file://" which needs to be # removed so it doesn't confuse the FwkJobReport() parser. jobReportPath = parameters.get("fwjr_path", None) if not jobReportPath: logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty") jobReportPath = jobReportPath.replace("file://","") if not os.path.exists(jobReportPath): logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath) if os.path.getsize(jobReportPath) == 0: logging.error("Empty FwkJobReport: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath) jobReport = Report() try: jobReport.load(jobReportPath) except Exception, ex: msg = "Error loading jobReport %s\n" % jobReportPath msg += str(ex) logging.error(msg) logging.debug("Failing job: %s\n" % parameters) return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport') if len(jobReport.listSteps()) == 0: logging.error("FwkJobReport with no steps: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath) return jobReport
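# The guard clauses in loadJobReport() amount to a small validation routine:
# an empty path, a missing file and a zero-size file each map to a distinct
# error code before any parsing is attempted. A standalone sketch of those
# checks (the error codes are copied from the method above; the helper itself
# and its name are illustrative, not part of the component):

import os

def validateReportPath(jobReportPath):
    """Return (cleanPath, None) if usable, else (None, (errorCode, reason))."""
    if not jobReportPath:
        return None, (99999, "FWJR path is empty")
    # strip a "file://" prefix so it doesn't confuse the parser
    cleanPath = jobReportPath.replace("file://", "")
    if not os.path.exists(cleanPath):
        return None, (99999, "Cannot find file in jobReport path: %s" % cleanPath)
    if os.path.getsize(cleanPath) == 0:
        return None, (99998, "jobReport of size 0: %s" % cleanPath)
    return cleanPath, None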
class PhEDExInjectorPoller(BaseWorkerThread): """ _PhEDExInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config self.phedex = PhEDEx({"endpoint": config.PhEDExInjector.phedexurl}, "json") self.dbsUrl = config.DBSInterface.globalDBSUrl self.group = getattr(config.PhEDExInjector, "group", "DataOps") # This will be used to map SE names which are stored in the DBSBuffer to # PhEDEx node names. The first key will be the "kind" which consists # of one of the following: MSS, Disk, Buffer. The next key will be the # SE name. self.seMap = {} self.nodeNames = [] # initialize the alert framework (if available - config.Alert present) # self.sendAlert will then be available self.initAlerts(compName="PhEDExInjector") def setup(self, parameters): """ _setup_ Create a DAO Factory for the PhEDExInjector. Also load the SE names to PhEDEx node name mappings from the data service. """ myThread = threading.currentThread() daofactory = DAOFactory(package="WMComponent.PhEDExInjector.Database", logger=self.logger, dbinterface=myThread.dbi) self.getUninjected = daofactory(classname="GetUninjectedFiles") self.getMigrated = daofactory(classname="GetMigratedBlocks") daofactory = DAOFactory(package="WMComponent.DBSBuffer.Database", logger=self.logger, dbinterface=myThread.dbi) self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus") daofactory = DAOFactory(package="WMComponent.DBSUpload.Database", logger=self.logger, dbinterface=myThread.dbi) self.setBlockStatus = daofactory(classname="SetBlockStatus") nodeMappings = self.phedex.getNodeMap() for node in nodeMappings["phedex"]["node"]: if not self.seMap.has_key(node["kind"]): self.seMap[node["kind"]] = {} logging.info("Adding mapping %s -> %s" % (node["se"], node["name"])) self.seMap[node["kind"]][node["se"]] = node["name"] self.nodeNames.append(node["name"]) return def createInjectionSpec(self, injectionData): """ _createInjectionSpec_ Transform the data structure returned from the database into an XML string for the PhEDEx Data Service. The injectionData parameter must be a dictionary keyed by dataset path. Each dataset path will map to a list of blocks, each block being a dict. The block dicts will have three keys: name, is-open and files. The files key will be a list of dicts, each of which has the following keys: lfn, size and checksum. The following is an example object: {"dataset1": {"block1": {"is-open": "y", "files": [{"lfn": "lfn1", "size": 10, "checksum": {"cksum": "1234"}}, {"lfn": "lfn2", "size": 20, "checksum": {"cksum": "4321"}}]}}} """ injectionSpec = XMLDrop.XMLInjectionSpec(self.dbsUrl) for datasetPath in injectionData: datasetSpec = injectionSpec.getDataset(datasetPath) for fileBlockName, fileBlock in injectionData[datasetPath].iteritems(): blockSpec = datasetSpec.getFileblock(fileBlockName, fileBlock["is-open"]) for file in fileBlock["files"]: blockSpec.addFile(file["lfn"], file["checksum"], file["size"]) return injectionSpec.save() def injectFiles(self): """ _injectFiles_ Inject any uninjected files in PhEDEx. """ myThread = threading.currentThread() uninjectedFiles = self.getUninjected.execute() injectedFiles = [] for siteName in uninjectedFiles.keys(): # SE names can be stored in DBSBuffer as that is what is returned in # the framework job report. We'll try to map the SE name to a # PhEDEx node name here.
location = None if siteName in self.nodeNames: location = siteName else: if self.seMap.has_key("Buffer") and \ self.seMap["Buffer"].has_key(siteName): location = self.seMap["Buffer"][siteName] elif self.seMap.has_key("MSS") and \ self.seMap["MSS"].has_key(siteName): location = self.seMap["MSS"][siteName] elif self.seMap.has_key("Disk") and \ self.seMap["Disk"].has_key(siteName): location = self.seMap["Disk"][siteName] if location is None: msg = "Could not map SE %s to PhEDEx node." % siteName logging.error(msg) self.sendAlert(7, msg=msg) continue myThread.transaction.begin() xmlData = self.createInjectionSpec(uninjectedFiles[siteName]) try: injectRes = self.phedex.injectBlocks(location, xmlData) except Exception, ex: # If we get an error here, assume that it's temporary (it usually is) # log it, and ignore it in the algorithm() loop msg = "Encountered error while attempting to inject blocks to PhEDEx.\n" msg += str(ex) logging.error(msg) logging.debug("Traceback: %s" % str(traceback.format_exc())) raise PhEDExInjectorPassableError(msg) logging.info("Injection result: %s" % injectRes) if not injectRes.has_key("error"): for datasetName in uninjectedFiles[siteName]: for blockName in uninjectedFiles[siteName][datasetName]: for file in uninjectedFiles[siteName][datasetName][blockName]["files"]: injectedFiles.append(file["lfn"]) else: msg = ("Error injecting data %s: %s" % (uninjectedFiles[siteName], injectRes["error"])) logging.error(msg) self.sendAlert(6, msg=msg) self.setStatus.execute(injectedFiles, 1, conn=myThread.transaction.conn, transaction=myThread.transaction) injectedFiles = [] myThread.transaction.commit() return
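# injectFiles() above converts failed injection calls into
# PhEDExInjectorPassableError, signalling an error the polling loop can
# survive. The algorithm() method of this class is not shown here, so the
# consumer below is a hypothetical sketch, not the component's actual code:
# a plausible loop logs the passable error and waits for the next polling
# cycle instead of crashing the component.

def runInjectionCycle(poller):
    """Hypothetical sketch: treat passable errors as a skipped cycle."""
    try:
        poller.injectFiles()
    except PhEDExInjectorPassableError as ex:
        # assumed temporary (e.g. a transient data-service glitch):
        # log it and retry on the next polling cycle
        logging.error("Skipping injection cycle after passable error: %s", str(ex))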