def archiveTasks(self):
    """
    _archiveTasks_

    Process every workflow returned by the Workflow.GetFinishedWorkflows DAO:

    1. Load the workflow spec and upload its summary to couch
    2. Notify the WorkQueue (when one is configured) about the finished
       subscriptions
    3. Mark the request "completed" in wmstats unless ReqMgr is used for the
       completion check, and "aborted-completed" in central couch when the
       workflow is listed there as aborted
    4. Record the workflow in wfsToDelete once all of the above succeeded

    A failure for one workflow is logged and reported through sendAlert,
    and that workflow is skipped; the remaining workflows are still handled.
    """
    finishedWorkflowsDAO = self.daoFactory(classname="Workflow.GetFinishedWorkflows")
    finishedwfs = finishedWorkflowsDAO.execute()

    logging.info("Found %d candidate workflows for deletion", len(finishedwfs))

    abortedWorkflows = self.centralCouchDBWriter.workflowsByStatus(["aborted"], format="dict")

    # Only workflows whose upload and notification succeeded are recorded here.
    # NOTE(review): nothing in this block consumes wfsToDelete afterwards;
    # presumably the actual deletion step follows elsewhere -- confirm.
    wfsToDelete = {}
    for workflow in finishedwfs:
        try:
            # Upload summary to couch
            specURL = finishedwfs[workflow]["spec"]
            spec = retrieveWMSpec(wmWorkloadURL=specURL)
            if not spec:
                # Exception accepts no keyword arguments, so the message has
                # to be positional; report the spec location that failed.
                raise Exception("Couldn't load spec from %s" % specURL)
            self.archiveWorkflowSummary(spec=spec)

            # Notify the WorkQueue, if there is one
            if self.workQueue is not None:
                subList = []
                for subscriptions in finishedwfs[workflow]["workflows"].values():
                    subList.extend(subscriptions)
                self.notifyWorkQueue(subList)

            # The workflow as a whole is finished, so its request status can
            # be updated in couch.
            if not self.useReqMgrForCompletionCheck:
                self.wmstatsCouchDB.updateRequestStatus(workflow, "completed")
                logging.info("status updated to completed %s", workflow)

            if workflow in abortedWorkflows:
                self.centralCouchDBWriter.updateRequestStatus(workflow, "aborted-completed")
                logging.info("status updated to aborted-completed %s", workflow)

            wfsToDelete[workflow] = {"spec": spec,
                                     "workflows": finishedwfs[workflow]["workflows"]}

        except TaskArchiverPollerException as ex:
            # Something didn't go well when notifying the workqueue, skip it
            logging.error(str(ex))
            self.sendAlert(1, msg=str(ex))
            continue
        except Exception as ex:
            # Something didn't go well on couch, skip this workflow
            msg = "Couldn't upload summary for workflow %s, will try again next time\n" % workflow
            msg += "Nothing will be deleted until the summary is in couch\n"
            msg += "Exception message: %s" % str(ex)
            # logging.exception records the traceback in the log instead of
            # printing it to stdout.
            logging.exception(msg)
            self.sendAlert(3, msg=msg)
            continue
def archiveTasks(self):
    """
    _archiveTasks_

    Process every workflow returned by the Workflow.GetFinishedWorkflows DAO:

    1. Load the workflow spec and upload its summary to couch
    2. Notify the WorkQueue (when one is configured) about the finished
       subscriptions
    3. Mark the request "completed" in wmstats unless ReqMgr is used for the
       completion check, and "aborted-completed" in central couch when the
       workflow is listed there as aborted
    4. Record the workflow in wfsToDelete once all of the above succeeded

    A failure for one workflow is logged and reported through sendAlert,
    and that workflow is skipped; the remaining workflows are still handled.
    """
    finishedWorkflowsDAO = self.daoFactory(
        classname="Workflow.GetFinishedWorkflows")
    finishedwfs = finishedWorkflowsDAO.execute()

    logging.info("Found %d candidate workflows for deletion",
                 len(finishedwfs))

    abortedWorkflows = self.centralCouchDBWriter.workflowsByStatus(
        ["aborted"], format="dict")

    # Only workflows whose upload and notification succeeded are recorded here.
    # NOTE(review): nothing in this block consumes wfsToDelete afterwards;
    # presumably the actual deletion step follows elsewhere -- confirm.
    wfsToDelete = {}
    for workflow in finishedwfs:
        try:
            #Upload summary to couch
            specURL = finishedwfs[workflow]["spec"]
            spec = retrieveWMSpec(wmWorkloadURL=specURL)
            if not spec:
                # Exception accepts no keyword arguments, so the message has
                # to be positional; report the spec location that failed.
                raise Exception("Couldn't load spec from %s" % specURL)
            self.archiveWorkflowSummary(spec=spec)

            #Notify the WorkQueue, if there is one
            if self.workQueue is not None:
                subList = []
                for subscriptions in finishedwfs[workflow]["workflows"].values():
                    subList.extend(subscriptions)
                self.notifyWorkQueue(subList)

            #The workflow as a whole is finished, so its request status can
            #be updated in couch.
            if not self.useReqMgrForCompletionCheck:
                self.wmstatsCouchDB.updateRequestStatus(
                    workflow, "completed")
                logging.info("status updated to completed %s", workflow)

            if workflow in abortedWorkflows:
                self.centralCouchDBWriter.updateRequestStatus(
                    workflow, "aborted-completed")
                logging.info("status updated to aborted-completed %s",
                             workflow)

            wfsToDelete[workflow] = {
                "spec": spec,
                "workflows": finishedwfs[workflow]["workflows"]
            }

        except TaskArchiverPollerException as ex:
            #Something didn't go well when notifying the workqueue, skip it
            logging.error(str(ex))
            self.sendAlert(1, msg=str(ex))
            continue
        except Exception as ex:
            #Something didn't go well on couch, skip this workflow
            msg = "Couldn't upload summary for workflow %s, will try again next time\n" % workflow
            msg += "Nothing will be deleted until the summary is in couch\n"
            msg += "Exception message: %s" % str(ex)
            #logging.exception records the traceback in the log instead of
            #printing it to stdout.
            logging.exception(msg)
            self.sendAlert(3, msg=msg)
            continue
def killSubscriptions(self, doneList):
    """
    _killSubscriptions_

    Actually dump the subscriptions.

    For every subscription in doneList: load it, optionally upload the
    publish information of its workflow, and delete the subscription.  When
    the workflow no longer exists afterwards, its task work directory is
    removed.  When no workflow is left for the spec either, the couch summary
    is archived, the workflow is deleted from couch and the sandbox directory
    is removed.

    Any error while handling one subscription is logged and reported through
    sendAlert(2, ...); processing then moves on to the next subscription.
    """
    for sub in doneList:
        logging.info("Deleting subscription %i" % sub['id'])
        try:
            sub.load()
            sub['workflow'].load()
            if self.uploadPublishInfo:
                self.createAndUploadPublish(sub['workflow'])
            sub.deleteEverything()
            workflow = sub['workflow']

            if workflow.exists():
                # Then there are other subscriptions attached
                # to the workflow
                continue

            # The workflow is gone, so it's time to delete the task area.
            # Only the second value returned by getMasterName is needed.
            _, taskDir = getMasterName(startDir=self.jobCacheDir,
                                       workflow=workflow)
            logging.info("About to delete work directory %s" % taskDir)
            if os.path.isdir(taskDir):
                # Remove the taskDir, because we're done
                shutil.rmtree(taskDir)
            else:
                msg = "Attempted to delete work directory but it was already gone: %s" % taskDir
                logging.error(msg)
                self.sendAlert(1, msg=msg)

            # Now check if the workflow is done
            if workflow.countWorkflowsBySpec() != 0:
                continue

            # If the WMSpec is done, then we have to delete
            # the sandbox, and send off the couch summary

            # First load the WMSpec
            try:
                logging.debug("Loading spec to delete sandbox dir for task %s" % workflow.task)
                spec = retrieveWMSpec(workflow=workflow)
                wmTask = spec.getTaskByPath(workflow.task)
            except Exception as ex:
                # We've passed the deletion point and can't recover, so
                # abort this subscription. There will be no couch summary.
                # Each part goes on its own line so the report is readable.
                msg = "Critical error in opening spec after workflow deletion\n"
                msg += "Task: %s\n" % workflow.task
                msg += "%s\n" % str(ex)
                msg += "There will be NO workflow summary for this task"
                raise TaskArchiverPollerException(msg)

            # Then pull its info from couch and archive it
            self.archiveCouchSummary(workflow=workflow, spec=spec)
            # The workflow name is the first component of the task path
            self.deleteWorkflowFromCouch(workflowName=workflow.task.split('/')[1])

            # Now take care of the sandbox
            sandbox = getattr(wmTask.data.input, 'sandbox', None)
            if sandbox:
                sandboxDir = os.path.dirname(sandbox)
                if os.path.isdir(sandboxDir):
                    shutil.rmtree(sandboxDir)
                    logging.debug("Sandbox dir deleted")
                else:
                    logging.error("Attempted to delete sandbox dir but it was already gone: %s" % sandboxDir)

        except Exception as ex:
            msg = "Critical error while deleting subscription %i\n" % sub['id']
            msg += "%s\n" % str(ex)
            msg += traceback.format_exc()
            logging.error(msg)
            self.sendAlert(2, msg=msg)
logging.info("Found %d candidate workflows for deletion" % len(finishedwfs)) centralCouchAlive = True try: #TODO: need to enable when reqmgr2 -wmstats is ready #abortedWorkflows = self.reqmgrCouchDBWriter.workflowsByStatus(["aborted"], format = "dict"); abortedWorkflows = self.centralCouchDBWriter.workflowsByStatus(["aborted"], format = "dict"); except Exception, ex: centralCouchAlive = False logging.error("we will try again when remote couch server comes back\n%s" % str(ex)) if centralCouchAlive: wfsToDelete = {} for workflow in finishedwfs: try: #Upload summary to couch spec = retrieveWMSpec(wmWorkloadURL = finishedwfs[workflow]["spec"]) if not spec: raise Exception(msg = "Couldn't load spec from %s" % workflow[1]) self.archiveWorkflowSummary(spec = spec) #Notify the WorkQueue, if there is one if self.workQueue != None: subList = [] for l in finishedwfs[workflow]["workflows"].values(): subList.extend(l) self.notifyWorkQueue(subList) #Now we now the workflow as a whole is gone, we can delete the information from couch if not self.useReqMgrForCompletionCheck: self.wmstatsCouchDB.updateRequestStatus(workflow, "completed") logging.info("status updated to completed %s" % workflow)