def testD_Exhausted(self):
    """
    _testExhausted_

    Test that the system can exhaust jobs correctly.

    Creates a job group whose jobs already carry retry_count=5, caps
    ErrorHandler.maxRetries at 1, fails the jobs, and then checks that the
    ErrorHandler moves every job to 'Exhausted' (no cooloff) and fails the
    subscription's acquired files.

    NOTE(review): this file contains more than one definition of
    testD_Exhausted; in a single class the last definition wins — confirm
    which copy is intended to run.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    # Path of the pickled workload that createWorkload wrote to disk.
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    # retry_count=5 already exceeds the maxRetries=1 set below.
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           retry_count=5,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    config.ErrorHandler.maxRetries = 1

    # Walk the jobs through the state machine into a failed state.
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

    testSubscription = Subscription(id=1)  # You should only have one
    testSubscription.load()
    testSubscription.loadData()

    # Do we have files to start with?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

    testErrorHandler = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test,
    testErrorHandler.reqAuxDB = None
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    # No jobs should remain failed or in cooloff ...
    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), 0)

    # ... they should all have been exhausted.
    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    # Did we fail the files?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
    self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
def testD_Exhausted(self):
    """
    _testExhausted_

    Test that the system can exhaust jobs correctly.

    NOTE(review): this is a byte-identical duplicate of an earlier
    testD_Exhausted definition in this file — in a single class only the
    last definition is executed; one copy should probably be removed.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    # Pickled workload written by createWorkload.
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    # retry_count=5 already exceeds the maxRetries=1 configured below.
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           retry_count=5,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    config.ErrorHandler.maxRetries = 1

    # Drive the jobs into a failed state.
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

    testSubscription = Subscription(id=1)  # You should only have one
    testSubscription.load()
    testSubscription.loadData()

    # Do we have files to start with?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

    testErrorHandler = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test,
    testErrorHandler.reqAuxDB = None
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    # Nothing should stay failed or enter cooloff ...
    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), 0)

    # ... every job must end up exhausted.
    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    # Did we fail the files?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
    self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
def testD_Exhausted(self):
    """
    _testExhausted_

    Test that the system can exhaust jobs correctly.

    Variant of the testD_Exhausted copies above: does not reset
    testErrorHandler.reqAuxDB and keeps the (unused) return value of
    createWorkload.  NOTE(review): only the last same-named definition in a
    class is executed — confirm which variant is intended.
    """
    workloadName = "TestWorkload"
    workload = self.createWorkload(workloadName=workloadName)  # return value unused
    workloadPath = os.path.join(self.testDir, "workloadTest", "TestWorkload",
                                "WMSandbox", "WMWorkload.pkl")

    # retry_count=5 already exceeds the maxRetries=1 set below.
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           retry_count=5,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    config.ErrorHandler.maxRetries = 1

    # Drive the jobs into a failed state.
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, "created", "new")
    changer.propagate(testJobGroup.jobs, "executing", "created")
    changer.propagate(testJobGroup.jobs, "complete", "executing")
    changer.propagate(testJobGroup.jobs, "jobfailed", "complete")

    testSubscription = Subscription(id=1)  # You should only have one
    testSubscription.load()
    testSubscription.loadData()

    # Do we have files to start with?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    # Nothing should stay failed or enter cooloff ...
    idList = self.getJobs.execute(state="JobFailed")
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state="JobCooloff")
    self.assertEqual(len(idList), 0)

    # ... every job must end up exhausted.
    idList = self.getJobs.execute(state="Exhausted")
    self.assertEqual(len(idList), self.nJobs)

    # Did we fail the files?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
    self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
def setupPromptRecoWorkflow(self):
    """
    _setupPromptRecoWorkflow_

    Populate WMBS with a real PromptReco workflow,
    every subscription must be unfinished at first.

    Also builds self.stateMap (state name -> list of subscriptions that must
    finish before that state is reached) and self.orderedStates, and registers
    the request in the request DB.  Every load() below depends on the objects
    having been created by _createSubscriptionsInWMBS first.
    """
    # Populate disk and WMBS
    testArguments = PromptRecoWorkloadFactory.getTestArguments()
    workflowName = 'PromptReco_Run195360_Cosmics'
    factory = PromptRecoWorkloadFactory()
    testArguments["EnableHarvesting"] = True
    testArguments["CouchURL"] = os.environ["COUCHURL"]
    workload = factory.factoryWorkloadConstruction(workflowName, testArguments)

    # Create the top-level fileset and the full subscription tree in WMBS.
    wmbsHelper = WMBSHelper(workload, 'Reco', 'SomeBlock', cachepath=self.testDir)
    wmbsHelper.createTopLevelFileset()
    wmbsHelper._createSubscriptionsInWMBS(wmbsHelper.topLevelTask,
                                          wmbsHelper.topLevelFileset)

    self.stateMap = {'AlcaSkim': [],
                     'Merge': [],
                     'Harvesting': [],
                     'Processing Done': []}
    self.orderedStates = ['AlcaSkim', 'Merge', 'Harvesting', 'Processing Done']

    # Populate WMStats
    self.requestDBWriter.insertGenericRequest({'RequestName': workflowName})
    self.requestDBWriter.updateRequestStatus(workflowName, 'Closed')

    # Task paths used to look the subscriptions back up from WMBS.
    topLevelTask = '/%s/Reco' % workflowName
    alcaSkimTask = '%s/AlcaSkim' % topLevelTask
    mergeTasks = ['%s/AlcaSkim/AlcaSkimMergeALCARECOStreamHcalCalHOCosmics',
                  '%s/AlcaSkim/AlcaSkimMergeALCARECOStreamTkAlCosmics0T',
                  '%s/AlcaSkim/AlcaSkimMergeALCARECOStreamMuAlGlobalCosmics',
                  '%s/RecoMergewrite_AOD',
                  '%s/RecoMergewrite_DQM',
                  '%s/RecoMergewrite_RECO']
    harvestingTask = '%s/RecoMergewrite_DQM/RecoMergewrite_DQMEndOfRunDQMHarvestMerged' % topLevelTask

    # The top-level (Reco) subscription gates the 'AlcaSkim' state.
    self.stateMap['AlcaSkim'].append(wmbsHelper.topLevelSubscription)

    # AlcaSkim subscription gates 'Merge'.
    alcaSkimWorkflow = Workflow(name=workflowName, task=alcaSkimTask)
    alcaSkimWorkflow.load()
    alcarecoFileset = Fileset(name='/PromptReco_Run195360_Cosmics/Reco/unmerged-write_ALCARECOALCARECO')
    alcarecoFileset.load()
    alcaSkimSub = Subscription(alcarecoFileset, alcaSkimWorkflow)
    alcaSkimSub.load()
    self.stateMap['Merge'].append(alcaSkimSub)

    # All merge subscriptions gate 'Harvesting'.
    for task in mergeTasks:
        mergeTask = task % topLevelTask
        mergeWorkflow = Workflow(name=workflowName, task=mergeTask)
        mergeWorkflow.load()
        if 'AlcaSkim' in mergeTask:
            # Strip the 'AlcaSkimMerge' prefix to recover the stream name.
            stream = mergeTask.split('/')[-1][13:]
            unmergedFileset = Fileset(name='%s/unmerged-%sALCARECO' % (alcaSkimTask, stream))
            unmergedFileset.load()
        else:
            # Data tier is the suffix after the last underscore.
            dataTier = mergeTask.split('/')[-1].split('_')[-1]
            unmergedFileset = Fileset(name='%s/unmerged-write_%s%s' % (topLevelTask, dataTier, dataTier))
            unmergedFileset.load()
        mergeSub = Subscription(unmergedFileset, mergeWorkflow)
        mergeSub.load()
        self.stateMap['Harvesting'].append(mergeSub)

    # The harvesting subscription gates 'Processing Done'.
    harvestingWorkflow = Workflow(name=workflowName, task=harvestingTask)
    harvestingWorkflow.load()
    harvestingFileset = Fileset(name='/PromptReco_Run195360_Cosmics/Reco/RecoMergewrite_DQM/merged-MergedDQM')
    harvestingFileset.load()
    harvestingSub = Subscription(harvestingFileset, harvestingWorkflow)
    harvestingSub.load()
    self.stateMap['Processing Done'].append(harvestingSub)

    return
def __call__(self, parameters):
    """
    Poller for looking in all active subscriptions for jobs that
    need to be made.

    :param parameters: iterable of dicts, each carrying a 'subscription'
        WMBS subscription id to split into jobs.
    :returns: the offending subscription id if a negative id is found,
        otherwise the input ``parameters`` once all subscriptions are done.

    Each subscription is processed inside explicit transactions; the commit
    at the end of the while-loop wraps all job creation for the subscription
    in a single commit/rollback unit.
    """
    logging.info("In JobCreatorWorker.__call__")

    myThread = threading.currentThread()

    for entry in parameters:
        # This retrieves a single subscription
        subscriptionID = entry.get('subscription')
        if subscriptionID < 0:
            logging.error("Got non-existant subscription")
            logging.error("Assuming parameters in error: returning")
            return subscriptionID

        myThread.transaction.begin()

        logging.info("About to call subscription %i" % subscriptionID)

        wmbsSubscription = Subscription(id = subscriptionID)
        wmbsSubscription.load()
        wmbsSubscription["workflow"].load()
        workflow = wmbsSubscription["workflow"]

        wmWorkload = retrieveWMSpec(wmbsSubscription)

        if not workflow.task or not wmWorkload:
            # Then we have a problem
            # We have no sandbox
            # We NEED a sandbox
            # Abort this subscription!
            # But do NOT fail
            # We have no way of marking a subscription as bad per se
            # We'll have to just keep skipping it
            wmTask = None
            seederList = []
            logging.error("Have no task for workflow %i" % (workflow.id))
            logging.error("Aborting Subscription %i" % (subscriptionID))
            continue
        else:
            wmTask = wmWorkload.getTaskByPath(workflow.task)
            # Only build a seeder list if the task declares seeders.
            if hasattr(wmTask.data, 'seeders'):
                manager = SeederManager(wmTask)
                seederList = manager.getSeederList()
            else:
                seederList = []

        logging.info("About to enter JobFactory")
        logging.debug("Going to call wmbsJobFactory with limit %i" % (self.limit))

        # My hope is that the job factory is smart enough only to split un-split jobs
        wmbsJobFactory = self.splitterFactory(package = "WMCore.WMBS",
                                              subscription = wmbsSubscription,
                                              generators=seederList,
                                              limit = self.limit)
        splitParams = retrieveJobSplitParams(wmWorkload, workflow.task)
        logging.debug("Split Params: %s" % splitParams)

        continueSubscription = True
        myThread.transaction.commit()

        # Turn on the jobFactory
        myThread.transaction.begin()
        wmbsJobFactory.open()

        # Create a function to hold it
        jobSplittingFunction = runSplitter(jobFactory = wmbsJobFactory,
                                           splitParams = splitParams)
        while continueSubscription:
            # This loop runs over the jobFactory,
            # using yield statements and a pre-existing proxy to
            # generate and process new jobs

            # First we need the jobs.
            try:
                wmbsJobGroups = next(jobSplittingFunction)
                logging.info("Retrieved %i jobGroups from jobSplitter" % (len(wmbsJobGroups)))
            except StopIteration:
                # If you receive a stopIteration, we're done
                logging.info("Completed iteration over subscription %i" % (subscriptionID))
                continueSubscription = False
                continue

            # Now we get to find out what job they are.
            countJobs = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow")
            jobNumber = countJobs.execute(workflow = workflow.id,
                                          conn = myThread.transaction.conn,
                                          transaction = True)
            logging.debug("Have %i jobs for this workflow already" % (jobNumber))

            for wmbsJobGroup in wmbsJobGroups:
                logging.debug("Processing jobGroup %i" % (wmbsJobGroup.exists()))
                logging.debug("Processing %i jobs" % (len(wmbsJobGroup.jobs)) )

                # Create a directory
                self.createWorkArea.processJobs(jobGroup = wmbsJobGroup,
                                                startDir = self.jobCacheDir,
                                                workflow = workflow,
                                                wmWorkload = wmWorkload,
                                                transaction = myThread.transaction,
                                                conn = myThread.transaction.conn)

                for job in wmbsJobGroup.jobs:
                    # jobNumber continues across groups within the workflow.
                    jobNumber += 1
                    self.saveJob(job = job, workflow = workflow,
                                 wmTask = wmTask, jobNumber = jobNumber)

                self.advanceJobGroup(wmbsJobGroup)

                logging.debug("Finished call for jobGroup %i" \
                              % (wmbsJobGroup.exists()))

        # END: while loop over jobSplitter
        myThread.transaction.commit()

        # About to reset everything
        # Drop references so gc can reclaim the (potentially large) objects.
        wmbsJobGroups = None
        wmTask = None
        wmWorkload = None
        splitParams = None
        wmbsJobFactory = None
        gc.collect()

        # About to check memory
        doMemoryCheck("About to get memory references: End of subscription loop")

    # Final memory check
    doMemoryCheck("About to get memory references: End of __call__()")

    logging.debug("About to return from JobCreatorWorker.__call__()")

    return parameters
def _createSubscriptionsInWMBS(self, task, fileset, alternativeFilesetClose=False):
    """
    __createSubscriptionsInWMBS_

    Create subscriptions in WMBS for all the tasks in the spec.
    This includes filesets, workflows and the output map for each task.

    Recurses over task.childTaskIterator(): each child task whose input
    output-module matches one of this task's output modules gets its own
    subscription on the corresponding output fileset.  The first
    subscription created is remembered as self.topLevelSubscription and
    returned.
    """
    # create runtime sandbox for workflow
    self.createSandbox()

    # FIXME: Let workflow put in values if spec is missing them
    workflow = Workflow(
        spec=self.wmSpec.specUrl(),
        owner=self.wmSpec.getOwner()["name"],
        dn=self.wmSpec.getOwner().get("dn", "unknown"),
        group=self.wmSpec.getOwner().get("group", "unknown"),
        owner_vogroup=self.wmSpec.getOwner().get("vogroup", "DEFAULT"),
        owner_vorole=self.wmSpec.getOwner().get("vorole", "DEFAULT"),
        name=self.wmSpec.name(),
        task=task.getPathName(),
        wfType=self.wmSpec.getDashboardActivity(),
        alternativeFilesetClose=alternativeFilesetClose,
        priority=self.wmSpec.priority())
    workflow.create()
    subscription = Subscription(fileset=fileset,
                                workflow=workflow,
                                split_algo=task.jobSplittingAlgorithm(),
                                type=task.getPrimarySubType())
    if subscription.exists():
        # Re-use the existing subscription instead of creating a duplicate.
        subscription.load()
        msg = "Subscription %s already exists for %s (you may ignore file insertion messages below, existing files wont be duplicated)"
        self.logger.info(msg % (subscription['id'], task.getPathName()))
    else:
        subscription.create()

    # Apply the task's site white/black lists to the subscription.
    for site in task.siteWhitelist():
        subscription.addWhiteBlackList([{"site_name": site, "valid": True}])

    for site in task.siteBlacklist():
        subscription.addWhiteBlackList([{"site_name": site, "valid": False}])

    if self.topLevelSubscription is None:
        self.topLevelSubscription = subscription
        logging.info("Top level subscription created: %s", subscription["id"])
    else:
        logging.info("Child subscription created: %s", subscription["id"])

    outputModules = task.getOutputModulesForTask()
    ignoredOutputModules = task.getIgnoredOutputModulesForTask()
    for outputModule in outputModules:
        for outputModuleName in outputModule.listSections_():
            if outputModuleName in ignoredOutputModules:
                logging.info("IgnoredOutputModule set for %s, skipping fileset creation.",
                             outputModuleName)
                continue
            # One open fileset per (non-ignored) output module.
            outputFileset = Fileset(self.outputFilesetName(task, outputModuleName))
            outputFileset.create()
            outputFileset.markOpen(True)
            mergedOutputFileset = None

            for childTask in task.childTaskIterator():
                if childTask.data.input.outputModule == outputModuleName:
                    if childTask.taskType() == "Merge":
                        # Merge children get a dedicated merged fileset.
                        mergedOutputFileset = Fileset(self.outputFilesetName(childTask, "Merged"))
                        mergedOutputFileset.create()
                        mergedOutputFileset.markOpen(True)

                        primaryDataset = getattr(getattr(outputModule, outputModuleName),
                                                 "primaryDataset", None)
                        if primaryDataset != None:
                            self.mergeOutputMapping[mergedOutputFileset.id] = primaryDataset

                    # Recurse for every child consuming this output module.
                    self._createSubscriptionsInWMBS(childTask, outputFileset,
                                                    alternativeFilesetClose)

            # If no merge child exists, the unmerged fileset doubles as merged.
            if mergedOutputFileset is None:
                workflow.addOutput(outputModuleName, outputFileset,
                                   outputFileset)
            else:
                workflow.addOutput(outputModuleName, outputFileset,
                                   mergedOutputFileset)

    return self.topLevelSubscription
def killWorkflows(self, workflows):
    """
    _killWorkflows_

    Delete all the information in couch and WMBS about the given workflow,
    go through all subscriptions and delete one by one.
    The input is a dictionary with workflow names as keys, fully loaded
    WMWorkloads and subscriptions lists as values.

    Errors for an individual workflow are logged and alerted but do not stop
    the processing of the remaining workflows.

    Fixes vs. original: `except Exception, ex` (Python-2-only syntax) is now
    `except Exception as ex`, and the `keys()` + in-place `sort()` (which
    fails on Python 3 dict views) is replaced by `sorted(..., reverse=True)`.
    Both forms also work on Python 2.6+.
    """
    for workflow in workflows:
        logging.info("Deleting workflow %s" % workflow)
        try:
            # Get the task-workflow ids, sort them by ID, higher ID first
            # so we kill the leaves of the tree first, root last.
            workflowsIDs = sorted(workflows[workflow]["workflows"].keys(),
                                  reverse=True)

            # Now go through all tasks and load the WMBS workflow objects
            wmbsWorkflows = []
            for wfID in workflowsIDs:
                wmbsWorkflow = Workflow(id=wfID)
                wmbsWorkflow.load()
                wmbsWorkflows.append(wmbsWorkflow)

            # Time to shoot one by one
            for wmbsWorkflow in wmbsWorkflows:
                if self.uploadPublishInfo:
                    self.createAndUploadPublish(wmbsWorkflow)

                # Load all the associated subscriptions and shoot them one by one
                subIDs = workflows[workflow]["workflows"][wmbsWorkflow.id]
                for subID in subIDs:
                    subscription = Subscription(id=subID)
                    subscription['workflow'] = wmbsWorkflow
                    subscription.load()
                    subscription.deleteEverything()

                # Check that the workflow is gone
                if wmbsWorkflow.exists():
                    # Something went bad, this workflow should be gone by now
                    msg = "Workflow %s, Task %s was not deleted completely" % (
                        wmbsWorkflow.name, wmbsWorkflow.task)
                    raise TaskArchiverPollerException(msg)

                # Now delete directories
                _, taskDir = getMasterName(startDir=self.jobCacheDir,
                                           workflow=wmbsWorkflow)
                logging.info("About to delete work directory %s" % taskDir)
                if os.path.exists(taskDir):
                    if os.path.isdir(taskDir):
                        shutil.rmtree(taskDir)
                    else:
                        # What we think of as a working directory is not a directory
                        # This should never happen and there is no way we can recover
                        # from this here. Bail out now and have someone look at things.
                        msg = "Work directory is not a directory, this should never happen: %s" % taskDir
                        raise TaskArchiverPollerException(msg)
                else:
                    msg = "Attempted to delete work directory but it was already gone: %s" % taskDir
                    logging.debug(msg)

            spec = workflows[workflow]["spec"]
            topTask = spec.getTopLevelTask()[0]

            # Now take care of the sandbox
            sandbox = getattr(topTask.data.input, 'sandbox', None)
            if sandbox:
                sandboxDir = os.path.dirname(sandbox)
                if os.path.isdir(sandboxDir):
                    shutil.rmtree(sandboxDir)
                    logging.debug("Sandbox dir deleted")
                else:
                    logging.error("Attempted to delete sandbox dir but it was already gone: %s" % sandboxDir)

        except Exception as ex:
            # Best-effort: log + alert, then continue with the next workflow.
            msg = "Critical error while deleting workflow %s\n" % workflow
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            self.sendAlert(2, msg=msg)
class JobGroup(WMBSBase, WMJobGroup):
    """
    A group (set) of Jobs

    WMBS-backed JobGroup: persists the group, its output fileset and its jobs
    through DAO actions, layered on the in-memory WMJobGroup.
    """

    def __init__(self, subscription = None, jobs = None, id = -1, uid = None, location = None):
        # id == -1 / uid == None mean "not yet persisted"; create()/load()
        # fill them in.
        WMBSBase.__init__(self)
        WMJobGroup.__init__(self, subscription=subscription, jobs = jobs)

        self.id = id
        self.lastUpdate = None
        self.uid = uid

        # NOTE(review): '!= None' should idiomatically be 'is not None'.
        if location != None:
            self.setSite(location)

        return

    def create(self):
        """
        Add the new jobgroup to WMBS, create the output Fileset object
        """
        myThread = threading.currentThread()
        existingTransaction = self.beginTransaction()

        #overwrite base class self.output for WMBS fileset
        self.output = Fileset(name = makeUUID())
        self.output.create()

        if self.uid == None:
            self.uid = makeUUID()

        action = self.daofactory(classname = "JobGroup.New")
        action.execute(self.uid, self.subscription["id"],
                       self.output.id, conn = self.getDBConn(),
                       transaction = self.existingTransaction())

        # exists() returns the database id of the freshly inserted row.
        self.id = self.exists()
        self.commitTransaction(existingTransaction)

        return

    def delete(self):
        """
        Remove a jobgroup from WMBS
        """
        deleteAction = self.daofactory(classname = "JobGroup.Delete")
        deleteAction.execute(id = self.id, conn = self.getDBConn(),
                             transaction = self.existingTransaction())

        return

    def exists(self):
        """
        Does a jobgroup exist with id if id is not provided, use the uid,
        return the id
        """
        if self.id != -1:
            action = self.daofactory(classname = "JobGroup.ExistsByID")
            result = action.execute(id = self.id, conn = self.getDBConn(),
                                    transaction = self.existingTransaction())
        else:
            action = self.daofactory(classname = "JobGroup.Exists")
            result = action.execute(uid = self.uid, conn = self.getDBConn(),
                                    transaction = self.existingTransaction())

        return result

    def load(self):
        """
        _load_

        Load all meta data associated with the JobGroup.  This includes the
        JobGroup id, uid, last_update time, subscription id and output fileset
        id.  Either the JobGroup id or uid must be specified for this to work.
        """
        existingTransaction = self.beginTransaction()

        if self.id > 0:
            loadAction = self.daofactory(classname = "JobGroup.LoadFromID")
            result = loadAction.execute(self.id, conn = self.getDBConn(),
                                        transaction = self.existingTransaction())
        else:
            loadAction = self.daofactory(classname = "JobGroup.LoadFromUID")
            result = loadAction.execute(self.uid, conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

        self.id = result["id"]
        self.uid = result["uid"]
        self.lastUpdate = result["last_update"]

        # Loads subscription and output fileset metadata, but NOT the jobs —
        # use loadData() for that.
        self.subscription = Subscription(id = result["subscription"])
        self.subscription.load()

        self.output = Fileset(id = result["output"])
        self.output.load()

        self.jobs = []
        self.commitTransaction(existingTransaction)

        return

    def loadData(self):
        """
        _loadData_

        Load all data that is associated with the jobgroup.  This includes
        loading all the subscription information, the output fileset
        information and all the jobs that are associated with the group.
        """
        existingTransaction = self.beginTransaction()

        if self.id < 0 or self.uid == None:
            self.load()

        self.subscription.loadData()
        self.output.loadData()

        loadAction = self.daofactory(classname = "JobGroup.LoadJobs")
        result = loadAction.execute(self.id, conn = self.getDBConn(),
                                    transaction = self.existingTransaction())

        self.jobs = []
        self.newjobs = []

        for jobID in result:
            newJob = Job(id = jobID["id"])
            newJob.loadData()
            self.add(newJob)

        # Move the freshly added jobs from newjobs into jobs.
        WMJobGroup.commit(self)

        self.commitTransaction(existingTransaction)

        return

    def commit(self):
        """
        _commit_

        Write any new jobs to the database, creating them in the database if
        necessary.
        """
        existingTransaction = self.beginTransaction()

        if self.id == -1:
            self.create()

        for j in self.newjobs:
            j.create(group = self)

        WMJobGroup.commit(self)
        self.commitTransaction(existingTransaction)

        return

    def setSite(self, site_name = None):
        """
        Updates the jobGroup with a site_name from the wmbs_location table
        """
        if not self.exists():
            return

        action = self.daofactory(classname = "JobGroup.SetSite")
        result = action.execute(site_name = site_name, jobGroupID = self.id,
                                conn = self.getDBConn(),
                                transaction = self.existingTransaction())

        return result

    def getSite(self):
        """
        Updates the jobGroup with a site_name from the wmbs_location table
        """
        if not self.exists():
            return

        action = self.daofactory(classname = "JobGroup.GetSite")
        result = action.execute(jobGroupID = self.id,
                                conn = self.getDBConn(),
                                transaction = self.existingTransaction())

        return result

    def listJobIDs(self):
        """
        Returns a list of job IDs
        Useful for times when threading the loading of jobGroups,
        where running loadData can overload UUID
        """
        existingTransaction = self.beginTransaction()

        if self.id < 0 or self.uid == None:
            self.load()

        loadAction = self.daofactory(classname = "JobGroup.LoadJobs")
        result = loadAction.execute(self.id, conn = self.getDBConn(),
                                    transaction = self.existingTransaction())

        jobIDList = []

        for jobID in result:
            jobIDList.append(jobID["id"])

        self.commitTransaction(existingTransaction)

        return jobIDList

    def commitBulk(self):
        """
        Creates jobs in a group instead of singly, as is done in
        jobGroup.commit()
        """
        myThread = threading.currentThread()

        if self.id == -1:
            # Create the group in its own committed transaction so the id
            # exists before the bulk job insert below.
            myThread.transaction.begin()
            #existingTransaction = self.beginTransaction()
            self.create()
            #self.commitTransaction(existingTransaction)
            myThread.transaction.commit()

        existingTransaction = self.beginTransaction()

        listOfJobs = []
        for job in self.newjobs:
            #First do all the header stuff
            # Skip jobs that are already persisted.
            if job["id"] != None:
                continue
            job["jobgroup"] = self.id
            if job["name"] == None:
                job["name"] = makeUUID()
            listOfJobs.append(job)

        bulkAction = self.daofactory(classname = "Jobs.New")
        result = bulkAction.execute(jobList = listOfJobs)

        #Use the results of the bulk commit to get the jobIDs
        fileDict = {}
        for job in listOfJobs:
            job['id'] = result[job['name']]
            fileDict[job['id']] = []
            # NOTE(review): 'file' shadows the builtin; harmless here but
            # worth renaming.
            for file in job['input_files']:
                fileDict[job['id']].append(file['id'])

        maskAction = self.daofactory(classname = "Masks.New")
        maskAction.execute(jobList = listOfJobs, conn = self.getDBConn(), \
                           transaction = self.existingTransaction())

        fileAction = self.daofactory(classname = "Jobs.AddFiles")
        fileAction.execute(jobDict = fileDict, conn = self.getDBConn(), \
                           transaction = self.existingTransaction())

        WMJobGroup.commit(self)
        self.commitTransaction(existingTransaction)

        return

    def getLocationsForJobs(self):
        """
        Gets a list of the locations that jobs can run at
        """
        if not self.exists():
            return

        action = self.daofactory(classname = "JobGroup.GetLocationsForJobs")
        result = action.execute(id = self.id, conn = self.getDBConn(),
                                transaction = self.existingTransaction())

        return result

    def __str__(self):
        """
        __str__

        Print out some information about the jobGroup
        as if jobGroup inherited from dict()
        """
        d = {'id': self.id,
             'uid': self.uid,
             'subscription': self.subscription,
             'output': self.output,
             'jobs': self.jobs,
             'newjobs': self.newjobs}

        return str(d)
def pollSubscriptions(self):
    """
    _pollSubscriptions_

    Poller for looking in all active subscriptions for jobs that need to be
    made.

    For each fileset subscription: load it (skipping dead ones), load its
    workflow and spec, build the job splitter and iterate it, handing each
    jobGroup to creatorProcess and recording the job cache directories.
    Each splitter iteration is wrapped in one transaction so a failure rolls
    back as a unit.

    Fixes vs. original: `except Exception, ex` (Python-2-only syntax)
    replaced with `except Exception as ex`, and the Python-2-only
    `jobSplittingFunction.next()` replaced with `next(jobSplittingFunction)`
    for consistency with the other worker in this file; fixed the
    'interation' typo in a log message.  All replacements also work on
    Python 2.6+.
    """
    logging.info("Beginning JobCreator.pollSubscriptions() cycle.")
    myThread = threading.currentThread()

    # First, get list of Subscriptions
    subscriptions = self.subscriptionList.execute()

    # Okay, now we have a list of subscriptions
    for subscriptionID in subscriptions:
        wmbsSubscription = Subscription(id=subscriptionID)
        try:
            wmbsSubscription.load()
        except IndexError:
            # This happens when the subscription no longer exists
            # i.e., someone executed a kill() function on the database
            # while the JobCreator was in cycle
            # Ignore this subscription
            msg = "JobCreator cannot load subscription %i" % subscriptionID
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            continue

        workflow = Workflow(id=wmbsSubscription["workflow"].id)
        workflow.load()
        wmbsSubscription['workflow'] = workflow
        wmWorkload = retrieveWMSpec(workflow=workflow)

        if not workflow.task or not wmWorkload:
            # Then we have a problem
            # We NEED a sandbox
            # Abort this subscription!
            # But do NOT fail
            # We have no way of marking a subscription as bad per se
            # We'll have to just keep skipping it
            msg = "Have no task for workflow %i\n" % (workflow.id)
            msg += "Aborting Subscription %i" % (subscriptionID)
            logging.error(msg)
            self.sendAlert(1, msg=msg)
            continue

        logging.debug("Have loaded subscription %i with workflow %i\n" %
                      (subscriptionID, workflow.id))

        # Set task object
        wmTask = wmWorkload.getTaskByPath(workflow.task)

        # Get generators
        # If you fail to load the generators, pass on the job
        try:
            if hasattr(wmTask.data, 'generators'):
                manager = GeneratorManager(wmTask)
                seederList = manager.getGeneratorList()
            else:
                seederList = []
        except Exception as ex:
            msg = "Had failure loading generators for subscription %i\n" % (
                subscriptionID)
            msg += "Exception: %s\n" % str(ex)
            msg += "Passing over this error. It will reoccur next iteration!\n"
            msg += "Please check or remove this subscription!\n"
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            continue

        logging.debug("Going to call wmbsJobFactory for sub %i with limit %i" %
                      (subscriptionID, self.limit))

        splitParams = retrieveJobSplitParams(wmWorkload, workflow.task)
        logging.debug("Split Params: %s" % splitParams)

        # My hope is that the job factory is smart enough only to split
        # un-split jobs
        splitterFactory = SplitterFactory(
            splitParams.get('algo_package', "WMCore.JobSplitting"))
        wmbsJobFactory = splitterFactory(package="WMCore.WMBS",
                                         subscription=wmbsSubscription,
                                         generators=seederList,
                                         limit=self.limit)

        # Turn on the jobFactory
        wmbsJobFactory.open()

        # Create a function to hold it
        jobSplittingFunction = runSplitter(jobFactory=wmbsJobFactory,
                                           splitParams=splitParams)

        # Now we get to find out how many jobs there are.
        jobNumber = self.countJobs.execute(workflow=workflow.id,
                                           conn=myThread.transaction.conn,
                                           transaction=True)
        jobNumber += splitParams.get('initial_lfn_counter', 0)
        logging.debug("Have %i jobs for workflow %s already in database." %
                      (jobNumber, workflow.name))

        continueSubscription = True
        while continueSubscription:
            # This loop runs over the jobFactory,
            # using yield statements and a pre-existing proxy to
            # generate and process new jobs

            # First we need the jobs.
            myThread.transaction.begin()
            try:
                wmbsJobGroups = next(jobSplittingFunction)
                logging.info("Retrieved %i jobGroups from jobSplitter" %
                             (len(wmbsJobGroups)))
            except StopIteration:
                # If you receive a stopIteration, we're done
                logging.info("Completed iteration over subscription %i" %
                             (subscriptionID))
                continueSubscription = False
                myThread.transaction.commit()
                break

            # If we have no jobGroups, we're done
            if len(wmbsJobGroups) == 0:
                logging.info("Found end in iteration over subscription %i" %
                             (subscriptionID))
                continueSubscription = False
                myThread.transaction.commit()
                break

            # Assemble a dict of all the info
            processDict = {'workflow': workflow,
                           'wmWorkload': wmWorkload,
                           'wmTaskName': wmTask.getPathName(),
                           'jobNumber': jobNumber,
                           'sandbox': wmTask.data.input.sandbox,
                           'wmTaskPrio': wmTask.getTaskPriority(),
                           'owner': wmWorkload.getOwner().get('name', None),
                           'ownerDN': wmWorkload.getOwner().get('dn', None),
                           'ownerGroup': wmWorkload.getOwner().get('vogroup', ''),
                           'ownerRole': wmWorkload.getOwner().get('vorole', '')}

            tempSubscription = Subscription(id=wmbsSubscription['id'])

            nameDictList = []
            for wmbsJobGroup in wmbsJobGroups:
                # For each jobGroup, put a dictionary
                # together and run it with creatorProcess
                jobsInGroup = len(wmbsJobGroup.jobs)
                wmbsJobGroup.subscription = tempSubscription
                tempDict = {}
                tempDict.update(processDict)
                tempDict['jobGroup'] = wmbsJobGroup
                tempDict['swVersion'] = wmTask.getSwVersion()
                tempDict['scramArch'] = wmTask.getScramArch()
                tempDict['jobNumber'] = jobNumber
                tempDict['agentNumber'] = self.agentNumber

                jobGroup = creatorProcess(work=tempDict,
                                          jobCacheDir=self.jobCacheDir)
                # Job numbering continues across groups of the workflow.
                jobNumber += jobsInGroup

                # Set jobCache for group
                for job in jobGroup.jobs:
                    nameDictList.append({'jobid': job['id'],
                                         'cacheDir': job['cache_dir']})
                    job["user"] = wmWorkload.getOwner()["name"]
                    job["group"] = wmWorkload.getOwner()["group"]

            # Set the caches in the database
            try:
                if len(nameDictList) > 0:
                    self.setBulkCache.execute(jobDictList=nameDictList,
                                              conn=myThread.transaction.conn,
                                              transaction=True)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unknown exception while setting the bulk cache:\n"
                msg += str(ex)
                logging.error(msg)
                self.sendAlert(6, msg=msg)
                logging.debug("Error while setting bulkCache with following values: %s\n" %
                              nameDictList)
                raise JobCreatorException(msg)

            # Advance the jobGroup in changeState
            for wmbsJobGroup in wmbsJobGroups:
                self.advanceJobGroup(wmbsJobGroup=wmbsJobGroup)

            # Now end the transaction so that everything is wrapped
            # in a single rollback
            myThread.transaction.commit()
def __call__(self, parameters):
    """
    Poller for looking in all active subscriptions for jobs that
    need to be made.

    :param parameters: iterable of dicts, each carrying a 'subscription'
        WMBS subscription id to split into jobs.
    :returns: the offending subscription id if a negative id is found,
        otherwise the input ``parameters`` once all subscriptions are done.

    Fix vs. original: `jobSplittingFunction.next()` (Python-2-only) replaced
    with `next(jobSplittingFunction)`, matching the other copy of this
    worker method in this file; the builtin form also works on Python 2.6+.
    """
    logging.info("In JobCreatorWorker.__call__")

    myThread = threading.currentThread()

    for entry in parameters:
        # This retrieves a single subscription
        subscriptionID = entry.get('subscription')
        if subscriptionID < 0:
            logging.error("Got non-existant subscription")
            logging.error("Assuming parameters in error: returning")
            return subscriptionID

        myThread.transaction.begin()

        logging.info("About to call subscription %i" % subscriptionID)

        wmbsSubscription = Subscription(id = subscriptionID)
        wmbsSubscription.load()
        wmbsSubscription["workflow"].load()
        workflow = wmbsSubscription["workflow"]

        wmWorkload = retrieveWMSpec(wmbsSubscription)

        if not workflow.task or not wmWorkload:
            # Then we have a problem
            # We have no sandbox
            # We NEED a sandbox
            # Abort this subscription!
            # But do NOT fail
            # We have no way of marking a subscription as bad per se
            # We'll have to just keep skipping it
            wmTask = None
            seederList = []
            logging.error("Have no task for workflow %i" % (workflow.id))
            logging.error("Aborting Subscription %i" % (subscriptionID))
            continue
        else:
            wmTask = wmWorkload.getTaskByPath(workflow.task)
            # Only build a seeder list if the task declares seeders.
            if hasattr(wmTask.data, 'seeders'):
                manager = SeederManager(wmTask)
                seederList = manager.getSeederList()
            else:
                seederList = []

        logging.info("About to enter JobFactory")
        logging.debug("Going to call wmbsJobFactory with limit %i" % (self.limit))

        # My hope is that the job factory is smart enough only to split un-split jobs
        wmbsJobFactory = self.splitterFactory(package = "WMCore.WMBS",
                                              subscription = wmbsSubscription,
                                              generators=seederList,
                                              limit = self.limit)
        splitParams = retrieveJobSplitParams(wmWorkload, workflow.task)
        logging.debug("Split Params: %s" % splitParams)

        continueSubscription = True
        myThread.transaction.commit()

        # Turn on the jobFactory
        myThread.transaction.begin()
        wmbsJobFactory.open()

        # Create a function to hold it
        jobSplittingFunction = runSplitter(jobFactory = wmbsJobFactory,
                                           splitParams = splitParams)
        while continueSubscription:
            # This loop runs over the jobFactory,
            # using yield statements and a pre-existing proxy to
            # generate and process new jobs

            # First we need the jobs.
            try:
                wmbsJobGroups = next(jobSplittingFunction)
                logging.info("Retrieved %i jobGroups from jobSplitter" % (len(wmbsJobGroups)))
            except StopIteration:
                # If you receive a stopIteration, we're done
                logging.info("Completed iteration over subscription %i" % (subscriptionID))
                continueSubscription = False
                continue

            # Now we get to find out what job they are.
            countJobs = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow")
            jobNumber = countJobs.execute(workflow = workflow.id,
                                          conn = myThread.transaction.conn,
                                          transaction = True)
            logging.debug("Have %i jobs for this workflow already" % (jobNumber))

            for wmbsJobGroup in wmbsJobGroups:
                logging.debug("Processing jobGroup %i" % (wmbsJobGroup.exists()))
                logging.debug("Processing %i jobs" % (len(wmbsJobGroup.jobs)))

                # Create a directory
                self.createWorkArea.processJobs(jobGroup = wmbsJobGroup,
                                                startDir = self.jobCacheDir,
                                                workflow = workflow,
                                                wmWorkload = wmWorkload,
                                                transaction = myThread.transaction,
                                                conn = myThread.transaction.conn)

                for job in wmbsJobGroup.jobs:
                    # jobNumber continues across groups within the workflow.
                    jobNumber += 1
                    self.saveJob(job = job, workflow = workflow,
                                 wmTask = wmTask, jobNumber = jobNumber)

                self.advanceJobGroup(wmbsJobGroup)

                logging.debug("Finished call for jobGroup %i" \
                              % (wmbsJobGroup.exists()))

        # END: while loop over jobSplitter
        myThread.transaction.commit()

        # About to reset everything
        # Drop references so gc can reclaim the (potentially large) objects.
        wmbsJobGroups = None
        wmTask = None
        wmWorkload = None
        splitParams = None
        wmbsJobFactory = None
        gc.collect()

        # About to check memory
        doMemoryCheck("About to get memory references: End of subscription loop")

    # Final memory check
    doMemoryCheck("About to get memory references: End of __call__()")

    logging.debug("About to return from JobCreatorWorker.__call__()")

    return parameters
def setupPromptRecoWorkflow(self):
    """
    _setupPromptRecoWorkflow_

    Populate WMBS with a real PromptReco workflow,
    every subscription must be unfinished at first.

    Builds self.stateMap (state -> list of subscriptions that gate that
    state) and self.orderedStates, and registers the request in WMStats.
    """
    # Populate disk and WMBS
    testArguments = PromptRecoWorkloadFactory.getTestArguments()
    workflowName = 'PromptReco_Run195360_Cosmics'
    factory = PromptRecoWorkloadFactory()
    testArguments["EnableHarvesting"] = True
    testArguments["CouchURL"] = os.environ["COUCHURL"]
    workload = factory.factoryWorkloadConstruction(workflowName, testArguments)

    wmbsHelper = WMBSHelper(workload, 'Reco', 'SomeBlock', cachepath=self.testDir)
    wmbsHelper.createTopLevelFileset()
    wmbsHelper._createSubscriptionsInWMBS(wmbsHelper.topLevelTask, wmbsHelper.topLevelFileset)

    self.stateMap = {'AlcaSkim': [],
                     'Merge': [],
                     'Harvesting': [],
                     'Processing Done': []}
    self.orderedStates = ['AlcaSkim', 'Merge', 'Harvesting', 'Processing Done']

    # Populate WMStats
    self.requestDBWriter.insertGenericRequest({'RequestName': workflowName})
    self.requestDBWriter.updateRequestStatus(workflowName, 'Closed')

    topLevelTask = '/%s/Reco' % workflowName
    alcaSkimTask = '%s/AlcaSkim' % topLevelTask
    mergeTasks = ['%s/AlcaSkim/AlcaSkimMergeALCARECOStreamHcalCalHOCosmics',
                  '%s/AlcaSkim/AlcaSkimMergeALCARECOStreamTkAlCosmics0T',
                  '%s/AlcaSkim/AlcaSkimMergeALCARECOStreamMuAlGlobalCosmics',
                  '%s/RecoMergewrite_AOD',
                  '%s/RecoMergewrite_DQM',
                  '%s/RecoMergewrite_RECO']
    harvestingTask = '%s/RecoMergewrite_DQM/RecoMergewrite_DQMEndOfRunDQMHarvestMerged' % topLevelTask

    self.stateMap['AlcaSkim'].append(wmbsHelper.topLevelSubscription)

    alcaSkimWorkflow = Workflow(name=workflowName, task=alcaSkimTask)
    alcaSkimWorkflow.load()
    # Derive the fileset name from topLevelTask instead of repeating the
    # workflow-name literal, so a workflowName change cannot desynchronize them.
    alcarecoFileset = Fileset(name='%s/unmerged-write_ALCARECOALCARECO' % topLevelTask)
    alcarecoFileset.load()
    alcaSkimSub = Subscription(alcarecoFileset, alcaSkimWorkflow)
    alcaSkimSub.load()
    self.stateMap['Merge'].append(alcaSkimSub)

    for task in mergeTasks:
        mergeTask = task % topLevelTask
        mergeWorkflow = Workflow(name=workflowName, task=mergeTask)
        mergeWorkflow.load()
        if 'AlcaSkim' in mergeTask:
            # Strip the 'AlcaSkimMerge' prefix (13 chars) to recover the stream name
            stream = mergeTask.split('/')[-1][13:]
            unmergedFileset = Fileset(name='%s/unmerged-%sALCARECO' % (alcaSkimTask, stream))
            unmergedFileset.load()
        else:
            # e.g. 'RecoMergewrite_AOD' -> data tier 'AOD'
            dataTier = mergeTask.split('/')[-1].split('_')[-1]
            unmergedFileset = Fileset(name='%s/unmerged-write_%s%s' % (topLevelTask,
                                                                       dataTier, dataTier))
            unmergedFileset.load()
        mergeSub = Subscription(unmergedFileset, mergeWorkflow)
        mergeSub.load()
        self.stateMap['Harvesting'].append(mergeSub)

    harvestingWorkflow = Workflow(name=workflowName, task=harvestingTask)
    harvestingWorkflow.load()
    # Same consistency fix: build from topLevelTask, not a duplicated literal
    harvestingFileset = Fileset(name='%s/RecoMergewrite_DQM/merged-MergedDQM' % topLevelTask)
    harvestingFileset.load()
    harvestingSub = Subscription(harvestingFileset, harvestingWorkflow)
    harvestingSub.load()
    self.stateMap['Processing Done'].append(harvestingSub)

    return
class JobGroup(WMBSBase, WMJobGroup):
    """
    A group (set) of Jobs

    WMBS-backed JobGroup: persists the group, its output Fileset and its
    member jobs to the database via DAO actions.
    """

    def __init__(self, subscription=None, jobs=None, id=-1, uid=None, location=None):
        """
        Initialize bookkeeping; id == -1 means "not yet in the database".
        If a location is given, immediately record it via setSite().
        """
        WMBSBase.__init__(self)
        WMJobGroup.__init__(self, subscription=subscription, jobs=jobs)

        self.id = id
        self.lastUpdate = None
        self.uid = uid

        if location is not None:  # 'is not None': PEP 8 singleton comparison
            self.setSite(location)

        return

    def create(self):
        """
        Add the new jobgroup to WMBS, create the output Fileset object
        """
        existingTransaction = self.beginTransaction()

        # overwrite base class self.output for WMBS fileset
        self.output = Fileset(name=makeUUID())
        self.output.create()

        if self.uid is None:
            self.uid = makeUUID()

        action = self.daofactory(classname="JobGroup.New")
        action.execute(self.uid, self.subscription["id"],
                       self.output.id, conn=self.getDBConn(),
                       transaction=self.existingTransaction())

        self.id = self.exists()
        self.commitTransaction(existingTransaction)
        return

    def delete(self):
        """
        Remove a jobgroup from WMBS
        """
        deleteAction = self.daofactory(classname="JobGroup.Delete")
        deleteAction.execute(id=self.id, conn=self.getDBConn(),
                             transaction=self.existingTransaction())
        return

    def exists(self):
        """
        Does a jobgroup exist with id; if id is not provided, use the uid.
        Returns the database id (or the DAO's falsy result when absent).
        """
        if self.id != -1:
            action = self.daofactory(classname="JobGroup.ExistsByID")
            result = action.execute(id=self.id, conn=self.getDBConn(),
                                    transaction=self.existingTransaction())
        else:
            action = self.daofactory(classname="JobGroup.Exists")
            result = action.execute(uid=self.uid, conn=self.getDBConn(),
                                    transaction=self.existingTransaction())
        return result

    def load(self):
        """
        _load_

        Load all meta data associated with the JobGroup.  This includes the
        JobGroup id, uid, last_update time, subscription id and output
        fileset id.  Either the JobGroup id or uid must be specified for
        this to work.
        """
        existingTransaction = self.beginTransaction()

        if self.id > 0:
            loadAction = self.daofactory(classname="JobGroup.LoadFromID")
            result = loadAction.execute(self.id, conn=self.getDBConn(),
                                        transaction=self.existingTransaction())
        else:
            loadAction = self.daofactory(classname="JobGroup.LoadFromUID")
            result = loadAction.execute(self.uid, conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

        self.id = result["id"]
        self.uid = result["uid"]
        self.lastUpdate = result["last_update"]

        self.subscription = Subscription(id=result["subscription"])
        self.subscription.load()

        self.output = Fileset(id=result["output"])
        self.output.load()

        # load() deliberately leaves jobs empty; use loadData() for them
        self.jobs = []
        self.commitTransaction(existingTransaction)
        return

    def loadData(self):
        """
        _loadData_

        Load all data that is associated with the jobgroup.  This includes
        loading all the subscription information, the output fileset
        information and all the jobs that are associated with the group.
        """
        existingTransaction = self.beginTransaction()

        if self.id < 0 or self.uid is None:
            self.load()

        self.subscription.loadData()
        self.output.loadData()

        loadAction = self.daofactory(classname="JobGroup.LoadJobs")
        result = loadAction.execute(self.id, conn=self.getDBConn(),
                                    transaction=self.existingTransaction())

        self.jobs = []
        self.newjobs = []

        for jobID in result:
            newJob = Job(id=jobID["id"])
            newJob.loadData()
            self.add(newJob)

        # Move freshly added jobs from newjobs into jobs
        WMJobGroup.commit(self)
        self.commitTransaction(existingTransaction)
        return

    def commit(self):
        """
        _commit_

        Write any new jobs to the database, creating them in the database
        if necessary.
        """
        existingTransaction = self.beginTransaction()

        if self.id == -1:
            self.create()

        for j in self.newjobs:
            j.create(group=self)

        WMJobGroup.commit(self)
        self.commitTransaction(existingTransaction)
        return

    def setSite(self, site_name=None):
        """
        Updates the jobGroup with a site_name from the wmbs_location table
        """
        if not self.exists():
            return

        action = self.daofactory(classname="JobGroup.SetSite")
        result = action.execute(site_name=site_name, jobGroupID=self.id,
                                conn=self.getDBConn(),
                                transaction=self.existingTransaction())
        return result

    def getSite(self):
        """
        Returns the jobGroup's site_name from the wmbs_location table
        """
        if not self.exists():
            return

        action = self.daofactory(classname="JobGroup.GetSite")
        result = action.execute(jobGroupID=self.id,
                                conn=self.getDBConn(),
                                transaction=self.existingTransaction())
        return result

    def listJobIDs(self):
        """
        Returns a list of job IDs.

        Useful for times when threading the loading of jobGroups, where
        running loadData can overload UUID.
        """
        existingTransaction = self.beginTransaction()

        if self.id < 0 or self.uid is None:
            self.load()

        loadAction = self.daofactory(classname="JobGroup.LoadJobs")
        result = loadAction.execute(self.id, conn=self.getDBConn(),
                                    transaction=self.existingTransaction())

        jobIDList = []
        for jobID in result:
            jobIDList.append(jobID["id"])

        self.commitTransaction(existingTransaction)
        return jobIDList

    def commitBulk(self):
        """
        Creates jobs in a group instead of singly, as is done in
        jobGroup.commit().
        """
        myThread = threading.currentThread()

        if self.id == -1:
            # Group creation runs in its own short transaction before the
            # bulk insert transaction begins.
            myThread.transaction.begin()
            self.create()
            myThread.transaction.commit()

        existingTransaction = self.beginTransaction()

        listOfJobs = []
        for job in self.newjobs:
            # First do all the header stuff; skip jobs already in the DB
            if job["id"] is not None:
                continue

            job["jobgroup"] = self.id
            if job["name"] is None:
                job["name"] = makeUUID()
            listOfJobs.append(job)

        bulkAction = self.daofactory(classname="Jobs.New")
        result = bulkAction.execute(jobList=listOfJobs)

        # Use the results of the bulk commit to get the jobIDs
        fileDict = {}
        for job in listOfJobs:
            job['id'] = result[job['name']]
            # 'inputFile' (was 'file') avoids shadowing the 'file' builtin
            fileDict[job['id']] = []
            for inputFile in job['input_files']:
                fileDict[job['id']].append(inputFile['id'])

        maskAction = self.daofactory(classname="Masks.New")
        maskAction.execute(jobList=listOfJobs, conn=self.getDBConn(),
                           transaction=self.existingTransaction())

        fileAction = self.daofactory(classname="Jobs.AddFiles")
        fileAction.execute(jobDict=fileDict, conn=self.getDBConn(),
                           transaction=self.existingTransaction())

        WMJobGroup.commit(self)
        self.commitTransaction(existingTransaction)
        return

    def getLocationsForJobs(self):
        """
        Gets a list of the locations that jobs can run at
        """
        if not self.exists():
            return

        action = self.daofactory(classname="JobGroup.GetLocationsForJobs")
        result = action.execute(id=self.id, conn=self.getDBConn(),
                                transaction=self.existingTransaction())
        return result

    def __str__(self):
        """
        __str__

        Print out some information about the jobGroup
        as if jobGroup inherited from dict()
        """
        d = {'id': self.id,
             'uid': self.uid,
             'subscription': self.subscription,
             'output': self.output,
             'jobs': self.jobs,
             'newjobs': self.newjobs}
        return str(d)
def databaseWork(self):
    """
    _databaseWork_

    Queries DB for all watched filesets; if a fileset matches a managed
    workflow's fileset_match pattern, create the subscription.

    Two passes: (1) unsubscribed managed workflows against all filesets,
    (2) unsubscribed filesets against all managed workflows.
    """
    # Get all watched workflows
    availableWorkflows = self.getUnsubscribedWorkflows.execute()
    logging.debug("Found %s unsubscribed managed workflows",
                  len(availableWorkflows))

    # Get all filesets to check if they match a workflow
    availableFilesets = self.getAllFilesets.execute()
    logging.debug("Found %s filesets", len(availableFilesets))

    # Loop on unsubscribed workflows to match filesets
    for managedWorkflow in availableWorkflows:
        # Workflow object cache to pass into the Subscription constructor;
        # loaded lazily on the first matching fileset, then reused.
        wfObj = None

        for fileset in availableFilesets:
            # Attempt to match workflows to filesets
            if not re.match(managedWorkflow['fileset_match'], fileset['name']):
                continue

            msg = "Creating subscription for %s to workflow id %s"
            msg %= (fileset['name'], managedWorkflow['workflow'])
            logging.debug(msg)

            # Match found - load the fileset
            fsObj = Fileset(id=fileset['id'])
            fsObj.load()

            # Load the workflow if not already loaded
            if not wfObj:
                wfObj = Workflow(id=managedWorkflow['workflow'])
                wfObj.load()

            # Create the subscription
            newSub = Subscription(fileset=fsObj,
                                  workflow=wfObj,
                                  split_algo=managedWorkflow['split_algo'],
                                  type=managedWorkflow['type'])
            newSub.create()

    managedWorkflows = self.getManagedWorkflows.execute()
    logging.debug("Found %s managed workflows", len(managedWorkflows))

    unsubscribedFilesets = self.getUnsubscribedFilesets.execute()
    logging.debug("Found %s unsubscribed filesets", len(unsubscribedFilesets))

    # Loop on unsubscribed filesets to match workflows
    for unsubscribedFileset in unsubscribedFilesets:
        # Fileset object cache, shared by every workflow matching this
        # fileset; loaded lazily on the first match.  (Previously this was
        # reset inside the inner loop, defeating the cache.)
        fsObj = None

        for managedWork in managedWorkflows:
            logging.debug("The workflow %s", managedWork['workflow'])

            # Attempt to match workflows to filesets
            if not re.match(managedWork['fileset_match'],
                            unsubscribedFileset['name']):
                continue

            msg = "Creating subscription for %s to workflow id %s"
            msg %= (unsubscribedFileset['name'], managedWork['workflow'])
            logging.debug(msg)

            # Match found - load the fileset if not already loaded
            if not fsObj:
                fsObj = Fileset(id=unsubscribedFileset['id'])
                fsObj.load()

            # Load the workflow for this match
            wfObj = Workflow(id=managedWork['workflow'])
            wfObj.load()

            # Create the subscription
            newSub = Subscription(fileset=fsObj,
                                  workflow=wfObj,
                                  split_algo=managedWork['split_algo'],
                                  type=managedWork['type'])
            newSub.create()
            newSub.load()
def _createSubscriptionsInWMBS(self, task, fileset, alternativeFilesetClose=False):
    """
    _createSubscriptionsInWMBS_

    Create subscriptions in WMBS for all the tasks in the spec.  This
    includes filesets, workflows and the output map for each task.
    Recurses into child tasks; returns the top-level subscription.
    """
    # create runtime sandbox for workflow
    self.createSandbox()

    # FIXME: Let workflow put in values if spec is missing them
    workflow = Workflow(spec=self.wmSpec.specUrl(),
                        owner=self.wmSpec.getOwner()["name"],
                        dn=self.wmSpec.getOwner().get("dn", "unknown"),
                        group=self.wmSpec.getOwner().get("group", "unknown"),
                        owner_vogroup=self.wmSpec.getOwner().get("vogroup", "DEFAULT"),
                        owner_vorole=self.wmSpec.getOwner().get("vorole", "DEFAULT"),
                        name=self.wmSpec.name(),
                        task=task.getPathName(),
                        wfType=self.wmSpec.getDashboardActivity(),
                        alternativeFilesetClose=alternativeFilesetClose,
                        priority=self.wmSpec.priority())
    workflow.create()

    subscription = Subscription(fileset=fileset, workflow=workflow,
                                split_algo=task.jobSplittingAlgorithm(),
                                type=task.getPrimarySubType())
    if subscription.exists():
        subscription.load()
        msg = "Subscription %s already exists for %s (you may ignore file insertion messages below, existing files wont be duplicated)"
        self.logger.info(msg % (subscription['id'], task.getPathName()))
    else:
        subscription.create()
        # Register the site white/black lists on the new subscription
        for site in task.siteWhitelist():
            subscription.addWhiteBlackList([{"site_name": site, "valid": True}])

        for site in task.siteBlacklist():
            subscription.addWhiteBlackList([{"site_name": site, "valid": False}])

    if self.topLevelSubscription is None:
        # First subscription created in this recursion is the top-level one
        self.topLevelSubscription = subscription
        logging.info("Top level subscription created: %s", subscription["id"])
    else:
        logging.info("Child subscription created: %s", subscription["id"])

    outputModules = task.getOutputModulesForTask()
    for outputModule in outputModules:
        for outputModuleName in outputModule.listSections_():
            outputFileset = Fileset(self.outputFilesetName(task, outputModuleName))
            outputFileset.create()
            outputFileset.markOpen(True)
            mergedOutputFileset = None

            for childTask in task.childTaskIterator():
                if childTask.data.input.outputModule == outputModuleName:
                    if childTask.taskType() == "Merge":
                        mergedOutputFileset = Fileset(self.outputFilesetName(childTask, "Merged"))
                        mergedOutputFileset.create()
                        mergedOutputFileset.markOpen(True)

                        primaryDataset = getattr(getattr(outputModule, outputModuleName),
                                                 "primaryDataset", None)
                        if primaryDataset is not None:
                            self.mergeOutputMapping[mergedOutputFileset.id] = primaryDataset

                    # Recurse: the child consumes this module's output fileset
                    self._createSubscriptionsInWMBS(childTask, outputFileset,
                                                    alternativeFilesetClose)

            if mergedOutputFileset is None:
                workflow.addOutput(outputModuleName, outputFileset, outputFileset)
            else:
                workflow.addOutput(outputModuleName, outputFileset, mergedOutputFileset)

    return self.topLevelSubscription
def databaseWork(self):
    """
    _databaseWork_

    Queries DB for all watched filesets; if a fileset matches a managed
    workflow's fileset_match pattern, create the subscription.

    Two passes: (1) unsubscribed managed workflows against all filesets,
    (2) unsubscribed filesets against all managed workflows.
    """
    # Get all watched workflows
    availableWorkflows = self.getUnsubscribedWorkflows.execute()
    logging.debug("Found %s unsubscribed managed workflows",
                  len(availableWorkflows))

    # Get all filesets to check if they match a workflow
    availableFilesets = self.getAllFilesets.execute()
    logging.debug("Found %s filesets", len(availableFilesets))

    # Loop on unsubscribed workflows to match filesets
    for managedWorkflow in availableWorkflows:
        # Workflow object cache to pass into the Subscription constructor;
        # loaded lazily on the first matching fileset, then reused.
        wfObj = None

        for fileset in availableFilesets:
            # Attempt to match workflows to filesets
            if not re.match(managedWorkflow['fileset_match'], fileset['name']):
                continue

            msg = "Creating subscription for %s to workflow id %s"
            msg %= (fileset['name'], managedWorkflow['workflow'])
            logging.debug(msg)

            # Match found - load the fileset
            fsObj = Fileset(id=fileset['id'])
            fsObj.load()

            # Load the workflow if not already loaded
            if not wfObj:
                wfObj = Workflow(id=managedWorkflow['workflow'])
                wfObj.load()

            # Create the subscription
            newSub = Subscription(fileset=fsObj,
                                  workflow=wfObj,
                                  split_algo=managedWorkflow['split_algo'],
                                  type=managedWorkflow['type'])
            newSub.create()

    managedWorkflows = self.getManagedWorkflows.execute()
    logging.debug("Found %s managed workflows", len(managedWorkflows))

    unsubscribedFilesets = self.getUnsubscribedFilesets.execute()
    logging.debug("Found %s unsubscribed filesets", len(unsubscribedFilesets))

    # Loop on unsubscribed filesets to match workflows
    for unsubscribedFileset in unsubscribedFilesets:
        # Fileset object cache, shared by every workflow matching this
        # fileset; loaded lazily on the first match.  (Previously this was
        # reset inside the inner loop, defeating the cache.)
        fsObj = None

        for managedWork in managedWorkflows:
            logging.debug("The workflow %s", managedWork['workflow'])

            # Attempt to match workflows to filesets
            if not re.match(managedWork['fileset_match'],
                            unsubscribedFileset['name']):
                continue

            msg = "Creating subscription for %s to workflow id %s"
            msg %= (unsubscribedFileset['name'], managedWork['workflow'])
            logging.debug(msg)

            # Match found - load the fileset if not already loaded
            if not fsObj:
                fsObj = Fileset(id=unsubscribedFileset['id'])
                fsObj.load()

            # Load the workflow for this match
            wfObj = Workflow(id=managedWork['workflow'])
            wfObj.load()

            # Create the subscription
            newSub = Subscription(fileset=fsObj,
                                  workflow=wfObj,
                                  split_algo=managedWork['split_algo'],
                                  type=managedWork['type'])
            newSub.create()
            newSub.load()
def pollSubscriptions(self):
    """
    Poller for looking in all active subscriptions
    for jobs that need to be made.

    For each subscription: load workflow + workload, build a splitter,
    and iterate the split generator, creating jobs batch-by-batch.  Each
    batch of jobGroups is wrapped in its own DB transaction so a failure
    rolls back only the current batch.
    """
    logging.info("Beginning JobCreator.pollSubscriptions() cycle.")
    myThread = threading.currentThread()

    # First, get list of Subscriptions
    subscriptions = self.subscriptionList.execute()

    # Okay, now we have a list of subscriptions
    for subscriptionID in subscriptions:
        wmbsSubscription = Subscription(id=subscriptionID)
        try:
            wmbsSubscription.load()
        except IndexError:
            # This happens when the subscription no longer exists
            # i.e., someone executed a kill() function on the database
            # while the JobCreator was in cycle
            # Ignore this subscription
            msg = "JobCreator cannot load subscription %i" % subscriptionID
            logging.error(msg)
            continue

        workflow = Workflow(id=wmbsSubscription["workflow"].id)
        workflow.load()
        wmbsSubscription['workflow'] = workflow
        wmWorkload = retrieveWMSpec(workflow=workflow)

        if not workflow.task or not wmWorkload:
            # Then we have a problem
            # We NEED a sandbox
            # Abort this subscription!
            # But do NOT fail
            # We have no way of marking a subscription as bad per se
            # We'll have to just keep skipping it
            msg = "Have no task for workflow %i\n" % (workflow.id)
            msg += "Aborting Subscription %i" % (subscriptionID)
            logging.error(msg)
            continue

        logging.debug("Have loaded subscription %i with workflow %i\n", subscriptionID, workflow.id)

        # retrieve information from the workload to propagate down to the job configuration
        allowOpport = wmWorkload.getAllowOpportunistic()

        # Set task object
        wmTask = wmWorkload.getTaskByPath(workflow.task)

        # Get generators
        # If you fail to load the generators, pass on the job
        try:
            if hasattr(wmTask.data, 'generators'):
                manager = GeneratorManager(wmTask)
                seederList = manager.getGeneratorList()
            else:
                seederList = []
        except Exception as ex:
            # Deliberately broad: a bad subscription must not kill the cycle
            msg = "Had failure loading generators for subscription %i\n" % (subscriptionID)
            msg += "Exception: %s\n" % str(ex)
            msg += "Passing over this error. It will reoccur next interation!\n"
            msg += "Please check or remove this subscription!\n"
            logging.error(msg)
            continue

        logging.debug("Going to call wmbsJobFactory for sub %i with limit %i", subscriptionID, self.limit)

        splitParams = retrieveJobSplitParams(wmWorkload, workflow.task)
        logging.debug("Split Params: %s", splitParams)

        # Load the proper job splitting module
        splitterFactory = SplitterFactory(splitParams.get('algo_package', "WMCore.JobSplitting"))
        # and return an instance of the splitting algorithm
        wmbsJobFactory = splitterFactory(package="WMCore.WMBS",
                                         subscription=wmbsSubscription,
                                         generators=seederList,
                                         limit=self.limit)

        # Turn on the jobFactory --> get available files for that subscription, keep result proxies
        wmbsJobFactory.open()

        # Create a function to hold it, calling __call__ from the JobFactory
        # which then calls algorithm method of the job splitting algo instance
        jobSplittingFunction = runSplitter(jobFactory=wmbsJobFactory,
                                           splitParams=splitParams)

        # Now we get to find out how many jobs there are.
        jobNumber = self.countJobs.execute(workflow=workflow.id,
                                           conn=myThread.transaction.conn,
                                           transaction=True)
        jobNumber += splitParams.get('initial_lfn_counter', 0)
        logging.debug("Have %i jobs for workflow %s already in database.", jobNumber, workflow.name)

        continueSubscription = True
        while continueSubscription:
            # This loop runs over the jobFactory,
            # using yield statements and a pre-existing proxy to
            # generate and process new jobs

            # First we need the jobs.
            # One transaction per batch of jobGroups
            myThread.transaction.begin()
            try:
                wmbsJobGroups = next(jobSplittingFunction)
                logging.info("Retrieved %i jobGroups from jobSplitter", len(wmbsJobGroups))
            except StopIteration:
                # If you receive a stopIteration, we're done
                logging.info("Completed iteration over subscription %i", subscriptionID)
                continueSubscription = False
                myThread.transaction.commit()
                break

            # If we have no jobGroups, we're done
            if len(wmbsJobGroups) == 0:
                logging.info("Found end in iteration over subscription %i", subscriptionID)
                continueSubscription = False
                myThread.transaction.commit()
                break

            # Assemble a dict of all the info
            processDict = {'workflow': workflow,
                           'wmWorkload': wmWorkload, 'wmTaskName': wmTask.getPathName(),
                           'jobNumber': jobNumber, 'sandbox': wmTask.data.input.sandbox,
                           'owner': wmWorkload.getOwner().get('name', None),
                           'ownerDN': wmWorkload.getOwner().get('dn', None),
                           'ownerGroup': wmWorkload.getOwner().get('vogroup', ''),
                           'ownerRole': wmWorkload.getOwner().get('vorole', ''),
                           'numberOfCores': 1,
                           'inputDataset': wmTask.getInputDatasetPath(),
                           'inputPileup': wmTask.getInputPileupDatasets()}
            try:
                # Use the maximum core count over all steps in the task
                maxCores = 1
                stepNames = wmTask.listAllStepNames()
                for stepName in stepNames:
                    sh = wmTask.getStep(stepName)
                    maxCores = max(maxCores, sh.getNumberOfCores())
                processDict.update({'numberOfCores': maxCores})
            except AttributeError:
                logging.info("Failed to read multicore settings from task %s", wmTask.getPathName())

            tempSubscription = Subscription(id=wmbsSubscription['id'])

            # if we have glideinWMS constraints, then adapt all jobs
            if self.glideinLimits:
                capResourceEstimates(wmbsJobGroups, self.glideinLimits)

            nameDictList = []
            for wmbsJobGroup in wmbsJobGroups:
                # For each jobGroup, put a dictionary
                # together and run it with creatorProcess
                jobsInGroup = len(wmbsJobGroup.jobs)
                wmbsJobGroup.subscription = tempSubscription
                tempDict = {}
                tempDict.update(processDict)
                tempDict['jobGroup'] = wmbsJobGroup
                tempDict['swVersion'] = wmTask.getSwVersion(allSteps=True)
                tempDict['scramArch'] = wmTask.getScramArch()
                tempDict['jobNumber'] = jobNumber
                tempDict['agentNumber'] = self.agentNumber
                tempDict['agentName'] = self.agentName
                tempDict['inputDatasetLocations'] = wmbsJobGroup.getLocationsForJobs()
                tempDict['allowOpportunistic'] = allowOpport

                jobGroup = creatorProcess(work=tempDict,
                                          jobCacheDir=self.jobCacheDir)
                jobNumber += jobsInGroup

                # Set jobCache for group
                for job in jobGroup.jobs:
                    nameDictList.append({'jobid': job['id'],
                                         'cacheDir': job['cache_dir']})
                    job["user"] = wmWorkload.getOwner()["name"]
                    job["group"] = wmWorkload.getOwner()["group"]

            # Set the caches in the database
            try:
                if len(nameDictList) > 0:
                    self.setBulkCache.execute(jobDictList=nameDictList,
                                              conn=myThread.transaction.conn,
                                              transaction=True)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unknown exception while setting the bulk cache:\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Error while setting bulkCache with following values: %s\n", nameDictList)
                raise JobCreatorException(msg)

            # Advance the jobGroup in changeState
            for wmbsJobGroup in wmbsJobGroups:
                self.advanceJobGroup(wmbsJobGroup=wmbsJobGroup)

            # Now end the transaction so that everything is wrapped
            # in a single rollback
            myThread.transaction.commit()

        # END: While loop over jobFactory

        # Close the jobFactory
        wmbsJobFactory.close()

    return
def killWorkflows(self, workflows):
    """
    _killWorkflows_

    Delete all the information in couch and WMBS about the given
    workflow, go through all subscriptions and delete one by
    one.
    The input is a dictionary with workflow names as keys, fully loaded WMWorkloads and
    subscriptions lists as values

    NOTE(review): Python 2 code — `except Exception, ex` syntax and
    in-place `keys().sort()` both break under Python 3.
    """
    for workflow in workflows:
        logging.info("Deleting workflow %s" % workflow)
        try:
            #Get the task-workflow ids, sort them by ID,
            #higher ID first so we kill
            #the leaves of the tree first, root last
            workflowsIDs = workflows[workflow]["workflows"].keys()
            workflowsIDs.sort(reverse = True)

            #Now go through all tasks and load the WMBS workflow objects
            wmbsWorkflows = []
            for wfID in workflowsIDs:
                wmbsWorkflow = Workflow(id = wfID)
                wmbsWorkflow.load()
                wmbsWorkflows.append(wmbsWorkflow)

            #Time to shoot one by one
            for wmbsWorkflow in wmbsWorkflows:
                if self.uploadPublishInfo:
                    self.createAndUploadPublish(wmbsWorkflow)

                #Load all the associated subscriptions and shoot them one by one
                subIDs = workflows[workflow]["workflows"][wmbsWorkflow.id]
                for subID in subIDs:
                    subscription = Subscription(id = subID)
                    subscription['workflow'] = wmbsWorkflow
                    subscription.load()
                    subscription.deleteEverything()

                #Check that the workflow is gone
                if wmbsWorkflow.exists():
                    #Something went bad, this workflow
                    #should be gone by now
                    msg = "Workflow %s, Task %s was not deleted completely" % (wmbsWorkflow.name,
                                                                               wmbsWorkflow.task)
                    raise TaskArchiverPollerException(msg)

                #Now delete directories
                _, taskDir = getMasterName(startDir = self.jobCacheDir,
                                           workflow = wmbsWorkflow)
                logging.info("About to delete work directory %s" % taskDir)
                if os.path.exists(taskDir):
                    if os.path.isdir(taskDir):
                        shutil.rmtree(taskDir)
                    else:
                        # What we think of as a working directory is not a directory
                        # This should never happen and there is no way we can recover
                        # from this here. Bail out now and have someone look at things.
                        msg = "Work directory is not a directory, this should never happen: %s" % taskDir
                        raise TaskArchiverPollerException(msg)
                else:
                    # Already gone: best-effort cleanup, not an error
                    msg = "Attempted to delete work directory but it was already gone: %s" % taskDir
                    logging.debug(msg)

            spec = workflows[workflow]["spec"]
            topTask = spec.getTopLevelTask()[0]

            # Now take care of the sandbox
            sandbox = getattr(topTask.data.input, 'sandbox', None)
            if sandbox:
                sandboxDir = os.path.dirname(sandbox)
                if os.path.isdir(sandboxDir):
                    shutil.rmtree(sandboxDir)
                    logging.debug("Sandbox dir deleted")
                else:
                    logging.error("Attempted to delete sandbox dir but it was already gone: %s" % sandboxDir)

        except Exception, ex:
            # Deliberately broad: one broken workflow must not abort the
            # deletion of the rest; alert and move on.
            msg = "Critical error while deleting workflow %s\n" % workflow
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            self.sendAlert(2, msg = msg)
def pollSubscriptions(self):
    """
    Poller for looking in all active subscriptions
    for jobs that need to be made.

    NOTE(review): Python 2 variant of this poller (`.next()`,
    `except Exception, ex`); a Python 3 version exists elsewhere.
    Each batch of jobGroups is wrapped in its own DB transaction.
    """
    logging.info("Beginning JobCreator.pollSubscriptions() cycle.")
    myThread = threading.currentThread()

    #First, get list of Subscriptions
    subscriptions = self.subscriptionList.execute()

    # Okay, now we have a list of subscriptions
    for subscriptionID in subscriptions:
        wmbsSubscription = Subscription(id = subscriptionID)
        try:
            wmbsSubscription.load()
        except IndexError:
            # This happens when the subscription no longer exists
            # i.e., someone executed a kill() function on the database
            # while the JobCreator was in cycle
            # Ignore this subscription
            msg = "JobCreator cannot load subscription %i" % subscriptionID
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            continue

        workflow = Workflow(id = wmbsSubscription["workflow"].id)
        workflow.load()
        wmbsSubscription['workflow'] = workflow
        wmWorkload = retrieveWMSpec(workflow = workflow)

        if not workflow.task or not wmWorkload:
            # Then we have a problem
            # We NEED a sandbox
            # Abort this subscription!
            # But do NOT fail
            # We have no way of marking a subscription as bad per se
            # We'll have to just keep skipping it
            msg = "Have no task for workflow %i\n" % (workflow.id)
            msg += "Aborting Subscription %i" % (subscriptionID)
            logging.error(msg)
            self.sendAlert(1, msg = msg)
            continue

        logging.debug("Have loaded subscription %i with workflow %i\n" % (subscriptionID, workflow.id))

        # Set task object
        wmTask = wmWorkload.getTaskByPath(workflow.task)

        # Get generators
        # If you fail to load the generators, pass on the job
        try:
            if hasattr(wmTask.data, 'generators'):
                manager = GeneratorManager(wmTask)
                seederList = manager.getGeneratorList()
            else:
                seederList = []
        except Exception, ex:
            # Deliberately broad: a bad subscription must not kill the cycle
            msg = "Had failure loading generators for subscription %i\n" % (subscriptionID)
            msg += "Exception: %s\n" % str(ex)
            msg += "Passing over this error. It will reoccur next interation!\n"
            msg += "Please check or remove this subscription!\n"
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            continue

        logging.debug("Going to call wmbsJobFactory for sub %i with limit %i" % (subscriptionID, self.limit))

        splitParams = retrieveJobSplitParams(wmWorkload, workflow.task)
        logging.debug("Split Params: %s" % splitParams)

        # My hope is that the job factory is smart enough only to split un-split jobs
        splitterFactory = SplitterFactory(splitParams.get('algo_package', "WMCore.JobSplitting"))
        wmbsJobFactory = splitterFactory(package = "WMCore.WMBS",
                                         subscription = wmbsSubscription,
                                         generators=seederList,
                                         limit = self.limit)

        # Turn on the jobFactory
        wmbsJobFactory.open()

        # Create a function to hold it
        jobSplittingFunction = runSplitter(jobFactory = wmbsJobFactory,
                                           splitParams = splitParams)

        # Now we get to find out how many jobs there are.
        jobNumber = self.countJobs.execute(workflow = workflow.id,
                                           conn = myThread.transaction.conn,
                                           transaction = True)
        jobNumber += splitParams.get('initial_lfn_counter', 0)
        logging.debug("Have %i jobs for workflow %s already in database." % (jobNumber, workflow.name))

        continueSubscription = True
        while continueSubscription:
            # This loop runs over the jobFactory,
            # using yield statements and a pre-existing proxy to
            # generate and process new jobs

            # First we need the jobs.
            # One transaction per batch of jobGroups
            myThread.transaction.begin()
            try:
                wmbsJobGroups = jobSplittingFunction.next()
                logging.info("Retrieved %i jobGroups from jobSplitter" % (len(wmbsJobGroups)))
            except StopIteration:
                # If you receive a stopIteration, we're done
                logging.info("Completed iteration over subscription %i" % (subscriptionID))
                continueSubscription = False
                myThread.transaction.commit()
                break

            # If we have no jobGroups, we're done
            if len(wmbsJobGroups) == 0:
                logging.info("Found end in iteration over subscription %i" % (subscriptionID))
                continueSubscription = False
                myThread.transaction.commit()
                break

            # Assemble a dict of all the info
            processDict = {'workflow': workflow,
                           'wmWorkload': wmWorkload, 'wmTaskName': wmTask.getPathName(),
                           'jobNumber': jobNumber, 'sandbox': wmTask.data.input.sandbox,
                           'owner': wmWorkload.getOwner().get('name', None),
                           'ownerDN': wmWorkload.getOwner().get('dn', None),
                           'ownerGroup': wmWorkload.getOwner().get('vogroup', ''),
                           'ownerRole': wmWorkload.getOwner().get('vorole', '')}

            tempSubscription = Subscription(id = wmbsSubscription['id'])

            nameDictList = []
            for wmbsJobGroup in wmbsJobGroups:
                # For each jobGroup, put a dictionary
                # together and run it with creatorProcess
                jobsInGroup = len(wmbsJobGroup.jobs)
                wmbsJobGroup.subscription = tempSubscription
                tempDict = {}
                tempDict.update(processDict)
                tempDict['jobGroup'] = wmbsJobGroup
                tempDict['swVersion'] = wmTask.getSwVersion()
                tempDict['scramArch'] = wmTask.getScramArch()
                tempDict['jobNumber'] = jobNumber
                tempDict['agentNumber'] = self.agentNumber

                jobGroup = creatorProcess(work = tempDict,
                                          jobCacheDir = self.jobCacheDir)
                jobNumber += jobsInGroup

                # Set jobCache for group
                for job in jobGroup.jobs:
                    nameDictList.append({'jobid':job['id'],
                                         'cacheDir':job['cache_dir']})
                    job["user"] = wmWorkload.getOwner()["name"]
                    job["group"] = wmWorkload.getOwner()["group"]

            # Set the caches in the database
            try:
                if len(nameDictList) > 0:
                    self.setBulkCache.execute(jobDictList = nameDictList,
                                              conn = myThread.transaction.conn,
                                              transaction = True)
            except WMException:
                # Known WM errors propagate untouched
                raise
            except Exception, ex:
                msg = "Unknown exception while setting the bulk cache:\n"
                msg += str(ex)
                logging.error(msg)
                self.sendAlert(6, msg = msg)
                logging.debug("Error while setting bulkCache with following values: %s\n" % nameDictList)
                raise JobCreatorException(msg)

            # Advance the jobGroup in changeState
            for wmbsJobGroup in wmbsJobGroups:
                self.advanceJobGroup(wmbsJobGroup = wmbsJobGroup)

            # Now end the transaction so that everything is wrapped
            # in a single rollback
            myThread.transaction.commit()
def _getSubscription(workflow, fileset):
    """
    Load *workflow* and *fileset* from the database, then return the
    loaded Subscription that links the two.
    """
    for wmbsObject in (workflow, fileset):
        wmbsObject.load()

    subscription = Subscription(fileset=fileset, workflow=workflow)
    subscription.load()
    return subscription