def main():
    """
    Print a summary of the workqueue elements belonging to a single request,
    looking at the global workqueue plus this agent's local workqueue and
    workqueue_inbox databases.

    Expects the request name as the only command line argument.
    """
    # Fall back to the standard agent configuration path when not set
    os.environ.setdefault('WMAGENT_CONFIG',
                          '/data/srv/wmagent/current/config/wmagent/config.py')
    agentConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    if len(sys.argv) != 2:
        print("You must provide a request name")
        sys.exit(1)
    request = sys.argv[1]

    # Backends to query, in reporting order: global queue, local queue, local inbox
    backends = [
        WorkQueueBackend(agentConfig.WorkloadSummary.couchurl, db_name="workqueue"),
        WorkQueueBackend(agentConfig.WorkQueueManager.couchurl, db_name="workqueue"),
        WorkQueueBackend(agentConfig.WorkQueueManager.couchurl, db_name="workqueue_inbox"),
    ]
    # Fetch all three document lists first, then summarize each one
    docLists = [backend.getElements(RequestName=request) for backend in backends]
    for backend, docs in zip(backends, docLists):
        createElementsSummary(request, docs, backend.queueUrl)
    sys.exit(0)
def main():
    """
    Summarize the workqueue elements of a single request across the global
    workqueue and this agent's local workqueue and workqueue_inbox databases.

    Expects the request name as the only command line argument.
    """
    # Fall back to the standard agent configuration path when not set
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    if len(sys.argv) != 2:
        print("You must provide a request name")
        sys.exit(1)
    reqName = sys.argv[1]
    # NOTE(review): the "global" backend is built from the WorkloadSummary
    # couch url -- presumably that points at central CouchDB; confirm in config
    globalWQBackend = WorkQueueBackend(config.WorkloadSummary.couchurl, db_name="workqueue")
    localWQBackend = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue")
    localWQInbox = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue_inbox")
    # Fetch every element belonging to this request from each database
    gqDocIDs = globalWQBackend.getElements(RequestName=reqName)
    localDocIDs = localWQBackend.getElements(RequestName=reqName)
    localInboxDocIDs = localWQInbox.getElements(RequestName=reqName)
    # Print one summary per queue/database
    createElementsSummary(reqName, gqDocIDs, globalWQBackend.queueUrl)
    createElementsSummary(reqName, localDocIDs, localWQBackend.queueUrl)
    createElementsSummary(reqName, localInboxDocIDs, localWQInbox.queueUrl)
    sys.exit(0)
def main():
    """
    Log summaries of this agent's local workqueue and workqueue_inbox
    elements, break the local workqueue docs down by status, and finish with
    a summary of the 'Available' elements in the global workqueue.
    """
    os.environ.setdefault('WMAGENT_CONFIG',
                          '/data/srv/wmagent/current/config/wmagent/config.py')
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Local workqueue and workqueue_inbox backends
    wqBackend = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue")
    wqInboxBackend = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue_inbox")
    wqElements = wqBackend.getElements()
    wqInboxElements = wqInboxBackend.getElements()

    # Overall summary of the local elements (the workqueue statuses are the
    # ones used for the per-status breakdown below)
    logging.info("************* LOCAL workqueue elements summary ************")
    foundStatus = createElementsSummary(wqInboxElements, 'workqueue_inbox')
    foundStatus = createElementsSummary(wqElements, 'workqueue')

    # Drill into the local workqueue docs, one status at a time
    for state in foundStatus:
        logging.info("\n************* workqueue elements summary by status: %s ************", state)
        byStatusSummary([elem for elem in wqElements if elem['Status'] == state],
                        localWQInboxDB=wqInboxBackend)

    # Finally, have a look at the central global queue
    logging.info("\n************* GLOBAL workqueue elements summary ************")
    gqBackend = WorkQueueBackend(config.WorkloadSummary.couchurl, db_name="workqueue")
    gqElements = gqBackend.getElements(status='Available')
    _ = createElementsSummary(gqElements, 'workqueue')
    #logging.info("Found %d 'Available' docs in global workqueue database", len(gqElements))
    byStatusSummary(gqElements)
    sys.exit(0)
def main():
    """
    Log summaries of this agent's local workqueue and workqueue_inbox
    elements, a per-status breakdown of the local workqueue docs, and a
    summary of the 'Available' elements in the global workqueue.
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    # Get local workqueue and workqueue_inbox docs
    localWQBackend = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue")
    localWQInboxDB = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue_inbox")
    wqDocIDs = localWQBackend.getElements()
    wqInboxDocIDs = localWQInboxDB.getElements()
    # Build and print a summary of these elements
    logging.info("************* LOCAL workqueue elements summary ************")
    # note: the first return value is overwritten -- only the statuses found
    # in the workqueue database drive the per-status breakdown below
    foundStatus = createElementsSummary(wqInboxDocIDs, 'workqueue_inbox')
    foundStatus = createElementsSummary(wqDocIDs, 'workqueue')
    # Now investigate docs in the workqueue database
    for status in foundStatus:
        logging.info("\n************* workqueue elements summary by status: %s ************", status)
        elemByStatus = [x for x in wqDocIDs if x['Status'] == status]
        byStatusSummary(elemByStatus, localWQInboxDB=localWQInboxDB)
    # time to look up at central global queue
    logging.info("\n************* GLOBAL workqueue elements summary ************")
    globalWQBackend = WorkQueueBackend(config.WorkloadSummary.couchurl, db_name="workqueue")
    gqDocIDs = globalWQBackend.getElements(status='Available')
    _ = createElementsSummary(gqDocIDs, 'workqueue')
    #logging.info("Found %d 'Available' docs in global workqueue database", len(gqDocIDs))
    byStatusSummary(gqDocIDs)
    sys.exit(0)
def checkGlobalWQStatus(config, status):
    """
    Query the central workqueue database for every element in the given
    status that was acquired by this agent, printing one line per element
    plus a final count.

    :param config: WMAgent configuration object
    :param status: WorkQueueElement status to filter on
    """
    thisAgent = "http://" + socket.gethostname() + ":5984"
    globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)

    matchedElements = globalBackend.getElements(status=status, ChildQueueUrl=thisAgent)
    for element in matchedElements:
        lastUpdate = time.strftime("%a, %d %b %Y %H:%M:%S %Z",
                                   time.localtime(float(element.updatetime)))
        print("id: %s\tRequestName: %s\tStatus: %s\t\tUpdatedIn: %s" %
              (element.id, element['RequestName'], element['Status'], lastUpdate))

    print("Elements matching the criteria (%s, %s) are: %d" % (status, thisAgent, len(matchedElements)))
    return
def killWorkflowAgent(WorkflowName):
    """
    Cancel work for a given workflow - delete in wmbs, delete from workqueue
    db, set canceled in inbox.

    :param WorkflowName: name of the workflow to cancel
    """
    # get configuration file path (fall back to the standard agent location)
    # NOTE: was os.environ.has_key(...), which is Python-2-only
    if "WMAGENT_CONFIG" not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    # load config
    wmConfig = loadConfigurationFile(os.environ['WMAGENT_CONFIG'])
    wqManager = wmConfig.section_('WorkQueueManager')

    couchUrl = wqManager.couchurl
    dbname = wqManager.dbname
    inboxDatabase = wqManager.inboxDatabase
    parentQueueCouchUrl = wqManager.queueParams['ParentQueueCouchUrl']

    # Creates backend
    backend = WorkQueueBackend(couchUrl, dbname, inboxDatabase, parentQueueCouchUrl)

    elements = backend.getElements(RequestName=WorkflowName)

    # take wf from args in case no elements exist for workflow (i.e. work was negotiating)
    requestNames = {x['RequestName'] for x in elements} | {WorkflowName}
    if not requestNames:
        # NOTE(review): unreachable in practice -- requestNames always
        # contains WorkflowName; kept for parity with the original script
        print('Workflow is not at the backend')

    inbox_elements = []
    for wf in requestNames:
        inbox_elements.extend(backend.getInboxElements(WorkflowName=wf))

    print("Canceling work for workflow: %s" % (requestNames,))
    for workflow in requestNames:
        try:
            connectToDB()
            jobDumpConfig = wmConfig
            bossAirConfig = wmConfig
            killWorkflow(workflow, jobDumpConfig, bossAirConfig)
        except Exception as ex:
            print('Aborting %s wmbs subscription failed: %s' % (workflow, str(ex)))
def checkLocalWQStatus(config, status):
    """
    Print every element in the given status from this agent's local
    workqueue and workqueue_inbox databases, with a per-database count.

    :param config: WMAgent configuration object
    :param status: WorkQueueElement status to filter on
    """
    localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
    # which fetcher to use for each couch database, in reporting order
    fetchers = (("workqueue", localBackend.getElements),
                ("workqueue_inbox", localBackend.getInboxElements))

    for dbName, fetch in fetchers:
        elements = fetch(status=status)
        for element in elements:
            lastUpdate = time.strftime("%a, %d %b %Y %H:%M:%S %Z",
                                       time.localtime(float(element.updatetime)))
            print("id: %s\tRequestName: %s\tStatus: %s\t\tUpdatedIn: %s" %
                  (element.id, element['RequestName'], element['Status'], lastUpdate))
        print("Elements matching the criteria (%s, %s) are: %d" % (status, dbName, len(elements)))
    return
def killWorkflowAgent(WorkflowName):
    """
    Cancel work for a given workflow - delete in wmbs, delete from workqueue
    db, set canceled in inbox.

    :param WorkflowName: name of the workflow to cancel
    """
    # get configuration file path (fall back to the standard agent location)
    # NOTE: was os.environ.has_key(...), which is Python-2-only
    if "WMAGENT_CONFIG" not in os.environ:
        os.environ["WMAGENT_CONFIG"] = "/data/srv/wmagent/current/config/wmagent/config.py"
    # load config
    wmConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    wqManager = wmConfig.section_("WorkQueueManager")

    couchUrl = wqManager.couchurl
    dbname = wqManager.dbname
    inboxDatabase = wqManager.inboxDatabase
    parentQueueCouchUrl = wqManager.queueParams["ParentQueueCouchUrl"]

    # Creates backend
    backend = WorkQueueBackend(couchUrl, dbname, inboxDatabase, parentQueueCouchUrl)

    elements = backend.getElements(RequestName=WorkflowName)

    # take wf from args in case no elements exist for workflow (i.e. work was negotiating)
    requestNames = {x["RequestName"] for x in elements} | {WorkflowName}
    if not requestNames:
        # NOTE(review): unreachable in practice -- requestNames always
        # contains WorkflowName; kept for parity with the original script
        print("Workflow is not at the backend")

    inbox_elements = []
    for wf in requestNames:
        inbox_elements.extend(backend.getInboxElements(WorkflowName=wf))

    print("Canceling work for workflow: %s" % (requestNames,))
    for workflow in requestNames:
        try:
            connectToDB()
            jobDumpConfig = wmConfig
            bossAirConfig = wmConfig
            killWorkflow(workflow, jobDumpConfig, bossAirConfig)
        except Exception as ex:
            print("Aborting %s wmbs subscription failed: %s" % (workflow, str(ex)))
def main():
    """
    Synchronize the priority of 'Available' global workqueue elements with
    the current RequestPriority known to central ReqMgr.

    Reads the workflow names (one per line) from the text file given as the
    only command line argument.

    NOTE: the previous docstring described a couch-deletion script; it was
    copied from elsewhere and did not match this code.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        print("usage: python syncPrioReqMgrxGQ.py <text_file_with_the_workflow_names>")
        sys.exit(0)
    inputFile = args[0]
    with open(inputFile) as f:
        listWorkflows = [line.rstrip('\n') for line in f.readlines()]

    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    wfDBReader = RequestDBReader(config.AnalyticsDataCollector.centralRequestDBURL,
                                 couchapp=config.AnalyticsDataCollector.RequestCouchApp)
    wqBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)

    workflowsDict = wfDBReader.getRequestByNames(listWorkflows)
    for wf, details in workflowsDict.items():
        print("wf: %s and prio: %s" % (wf, details['RequestPriority']))
        wqDocs = wqBackend.getElements(WorkflowName=wf)
        # only touch docs still Available whose priority diverged from ReqMgr
        docIds = [elem._id for elem in wqDocs
                  if elem['Status'] == 'Available' and elem['Priority'] != details['RequestPriority']]
        if docIds:
            print("Changing the priority of the following available docs: %s" % docIds)
            wqBackend.updateElements(*docIds, Priority=details['RequestPriority'])
        else:
            print(" there is nothing to update for this workflow.")
def main():
    """
    Either delete the couchdb docs of the single workflow given on the
    command line, or loop over workflows in final (or almost final) states
    and ask for permission before deleting them.

    NOTE(review): 'final_status' is not defined in this function; presumably
    it is a module-level list -- confirm it exists at module scope.
    """
    # optional single argument: the workflow to inspect/delete
    wfName = sys.argv[1] if len(sys.argv) == 2 else []
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    # Instantiating central services (couch stuff)
    # print "Central Couch URL : %s" % config.WorkloadSummary.couchurl
    # print "Central ReqMgr URL : %s\n" % config.AnalyticsDataCollector.centralRequestDBURL
    wfDBReader = RequestDBReader(config.AnalyticsDataCollector.centralRequestDBURL,
                                 couchapp=config.AnalyticsDataCollector.RequestCouchApp)
    # Central services
    wqBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
    wqInboxDB = Database('workqueue_inbox', config.WorkloadSummary.couchurl)
    # Local services
    # NOTE(review): the names look swapped -- localWQBackend points at
    # 'workqueue_inbox' while localWQInboxDB points at 'workqueue'; verify intent
    localWQBackend = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue_inbox")
    localWQInboxDB = Database('workqueue', config.WorkQueueManager.couchurl)
    # request states considered inactive/final
    statusList = ["failed", "epic-FAILED", "completed", "closed-out", "announced",
                  "aborted", "aborted-completed", "rejected", "normal-archived",
                  "aborted-archived", "rejected-archived"]
    for stat in final_status:
        # retrieve list of workflows in each status
        if not wfName:
            # options = {'include_docs': False}
            # hard-coded date window used to limit the "bydate" couch view
            date_range = {'startkey': [2015, 5, 15, 0, 0, 0], 'endkey': [2015, 5, 26, 0, 0, 0]}
            # finalWfs = wfDBReader.getRequestByCouchView("bydate", options, date_range)
            tempWfs = wfDBReader.getRequestByCouchView("bydate", date_range)
            #print "Found %d wfs in status: %s" %(len(finalWfs), stat)
            finalWfs = []
            for wf, content in tempWfs.iteritems():
                if content['RequestStatus'] in statusList:
                    finalWfs.append(wf)
            print "Found %d wfs in not in active state" % len(finalWfs)
        else:
            finalWfs = [wfName]
            tempWfs = wfDBReader.getRequestByNames(wfName, True)
            print "Checking %s with status '%s'." % (wfName, tempWfs[wfName]['RequestStatus'])
        wqDocs, wqInboxDocs = [], []
        localWQDocs, localWQInboxDocs = [], []
        for counter, wf in enumerate(finalWfs):
            # progress marker for long workflow lists
            if counter % 100 == 0:
                print "%d wfs queried ..." % counter
            # check whether there are workqueue docs
            wqDocIDs = wqBackend.getElements(WorkflowName=wf)
            if wqDocIDs:
                print "Found %d workqueue docs for %s, status %s" % (len(wqDocIDs), wf, tempWfs[wf]['RequestStatus'])
                print wqDocIDs
                wqDocs.append(wqDocIDs)
            # check whether there are workqueue_inbox docs
            if wqInboxDB.documentExists(wf):
                print "Found workqueue_inbox doc for %s, status %s" % (wf, tempWfs[wf]['RequestStatus'])
                # then retrieve the document
                wqInboxDoc = wqInboxDB.document(wf)
                wqInboxDocs.append(wqInboxDoc)
            # check local queue
            wqDocIDs = localWQBackend.getElements(WorkflowName=wf)
            if wqDocIDs:
                print "Found %d local workqueue docs for %s, status %s" % (len(wqDocIDs), wf, tempWfs[wf]['RequestStatus'])
                print wqDocIDs
                localWQDocs.append(wqDocIDs)
            if localWQInboxDB.documentExists(wf):
                print "Found local workqueue_inbox doc for %s, status %s" % (wf, tempWfs[wf]['RequestStatus'])
                wqInboxDoc = localWQInboxDB.document(wf)
                print wqInboxDoc
                localWQInboxDocs.append(wqInboxDoc)
    # TODO TODO TODO for the moment only deletes for a specific workflow
    if wfName:
        var = raw_input("\nCan we delete all these documents (Y/N)? ")
        if var == "Y":
            # deletes workqueue_inbox doc
            # NOTE(review): wqInboxDoc/wqDocIDs hold whatever the last loop
            # iteration left behind and may be unbound if nothing was found
            if wqInboxDoc:
                print "Deleting workqueue_inbox id %s and %s" % (wqInboxDoc['_id'], wqInboxDoc['_rev'])
                wqInboxDB.delete_doc(wqInboxDoc['_id'], wqInboxDoc['_rev'])
            # deletes workqueue docs
            if wqDocIDs:
                print "Deleting workqueue docs %s" % wqDocIDs
                wqBackend.deleteElements(*[x for x in wqDocIDs if x['RequestName'] in wfName])
        else:
            print "You are the boss, aborting it ...\n"
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics.
    """

    def __init__(self, config):
        """
        :param config: WMAgent configuration object; Agent.hostName,
            WorkloadSummary.couchurl and WorkQueueManager.couchurl are read
        """
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        # central (global) and local workqueue couch backends
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()

    def collectDrainInfo(self):
        """
        Call methods to check the drain status.

        :return: dict of drain statistics; extra keys are only filled in
            once all workflows are completed
        """
        results = {}
        results['workflows_completed'] = self.checkWorkflows()

        # if workflows are completed, collect additional drain statistics
        if results['workflows_completed']:
            results['upload_status'] = self.checkFileUploadStatus()
            results['condor_status'] = self.checkCondorStates()
            results['local_wq_status'] = self.checkLocalWQStatus(dbname="workqueue")
            results['local_wqinbox_status'] = self.checkLocalWQStatus(dbname="workqueue_inbox")
            results['global_wq_status'] = self.checkGlobalWQStatus()
        return results

    def checkWorkflows(self):
        """
        Check to see if all workflows have a 'completed' status.
        """
        results = self.dbsUtil.isAllWorkflowCompleted()
        return results

    def checkCondorStates(self):
        """
        Check idle and running jobs in Condor.

        :return: dict with 'idle' and 'running' job counts (or an error string)
        """
        results = {}
        # pairs of [JobStatus constraint value, result label]
        queries = [["1", "idle"], ["2", "running"]]

        for query in queries:
            jobs = self.condorAPI.getCondorJobs("JobStatus=="+query[0], [])
            # if there is an error, report it instead of the length of an empty list
            if jobs is None:
                results[query[1]] = "unknown (schedd query error)"
            else:
                results[query[1]] = len(jobs)
        return results

    def checkFileUploadStatus(self):
        """
        Check file upload status:
            Blocks open in DBS
            Files not uploaded in DBS
            Files not uploaded to Phedex
        """
        results = {}
        results['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        results['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(status="NOTUPLOADED")
        results['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return results

    def checkLocalWQStatus(self, dbname):
        """
        Query local WorkQueue workqueue/workqueue_inbox database to see
        whether there are any active elements in this agent.

        :param dbname: either "workqueue" or "workqueue_inbox"
        :return: dict mapping each active status to its element count
        """
        results = {}

        for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
            if dbname == "workqueue":
                elements = self.localBackend.getElements(status=st, returnIdOnly=True)
            else:
                elements = self.localBackend.getInboxElements(status=st, returnIdOnly=True)
            results[st] = len(elements)
        return results

    def checkGlobalWQStatus(self):
        """
        Query Global WorkQueue workqueue database to see whether there are
        any active elements set to this agent.

        :return: dict mapping each active status to its element count
        """
        results = {}

        for st in ("Acquired", "Running"):
            elements = self.globalBackend.getElements(status=st, returnIdOnly=True,
                                                      ChildQueueUrl=self.thisAgentUrl)
            results[st] = len(elements)
        return results
def main():
    """
    Reset this agent's 'Acquired' workqueue_inbox elements of a given
    workflow back to 'Available', then optionally delete the corresponding
    'Available' local workqueue elements. Prompts for confirmation before
    each change.
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # safety guard: this script is not considered production-ready yet
    print("Work in progress! It might create document conflicts as it is!")
    sys.exit(10)

    if len(sys.argv) != 2:
        print("You must provide a request name")
        sys.exit(1)
    reqName = sys.argv[1]
    childQueue = config.WorkQueueManager.queueParams['QueueURL']

    logger = setupLogger()
    localWQBackend = WorkQueueBackend(config.WorkQueueManager.couchurl, db_name="workqueue", logger=logger)
    localElems = localWQBackend.getElements(WorkflowName=reqName)
    localInboxElems = localWQBackend.getInboxElements(WorkflowName=reqName)

    docsToUpdate = []
    logger.info("** Local workqueue_inbox elements for workflow %s and agent %s", reqName, childQueue)
    for elem in localInboxElems:
        if elem['Status'] == "Acquired":
            logger.info("Element id: %s has status: %s", elem.id, elem['Status'])
            # put the element back to Available and detach it from this agent
            elem['Status'] = 'Available'
            elem['ChildQueueUrl'] = None
            docsToUpdate.append(elem)
    if docsToUpdate:
        var = raw_input("Found %d inbox elements to update, shall we proceed (Y/N): " % len(docsToUpdate))
        if var == "Y":
            resp = localWQBackend.saveElements(*docsToUpdate)
            logger.info(" update response: %s", resp)

    docsToUpdate = []
    logger.info("** Local workqueue elements for workflow %s and agent %s", reqName, childQueue)
    for elem in localElems:
        if elem['Status'] == "Available":
            logger.info("Element id: %s has status: %s", elem.id, elem['Status'])
            # BUGFIX: keep the element object itself (the original appended
            # elem._id, a plain string); the deletion below calls
            # elem.delete() and elem._couch.commit(), which only exist on
            # the document object
            docsToUpdate.append(elem)
    if docsToUpdate:
        var = raw_input("Found %d elements to delete, shall we proceed (Y/N): " % len(docsToUpdate))
        if var == "Y":
            for elem in docsToUpdate:
                elem.delete()
            resp = docsToUpdate[0]._couch.commit()
            logger.info(" deletion response: %s", resp)

    print("Done!")
    sys.exit(0)
def main():
    """
    Either delete the couchdb docs of the single workflow given on the
    command line, or loop over workflows in final (or almost final) states
    and ask for permission before deleting them.

    NOTE(review): 'final_status' is not defined in this function; presumably
    it is a module-level list -- confirm it exists at module scope.
    """
    # optional single argument: the workflow to inspect/delete
    wfName = sys.argv[1] if len(sys.argv) == 2 else []
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    # Instantiating central services (couch stuff)
    # print "Central Couch URL : %s" % config.WorkloadSummary.couchurl
    # print "Central ReqMgr URL : %s\n" % config.AnalyticsDataCollector.centralRequestDBURL
    wfDBReader = RequestDBReader(
        config.AnalyticsDataCollector.centralRequestDBURL,
        couchapp=config.AnalyticsDataCollector.RequestCouchApp)
    # Central services
    wqBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
    wqInboxDB = Database('workqueue_inbox', config.WorkloadSummary.couchurl)
    # Local services
    # NOTE(review): the names look swapped -- localWQBackend points at
    # 'workqueue_inbox' while localWQInboxDB points at 'workqueue'; verify intent
    localWQBackend = WorkQueueBackend(config.WorkQueueManager.couchurl,
                                      db_name="workqueue_inbox")
    localWQInboxDB = Database('workqueue', config.WorkQueueManager.couchurl)
    # request states considered inactive/final
    statusList = [
        "failed", "epic-FAILED", "completed", "closed-out", "announced",
        "aborted", "aborted-completed", "rejected", "normal-archived",
        "aborted-archived", "rejected-archived"
    ]
    for stat in final_status:
        # retrieve list of workflows in each status
        if not wfName:
            # options = {'include_docs': False}
            # hard-coded date window used to limit the "bydate" couch view
            date_range = {
                'startkey': [2015, 5, 15, 0, 0, 0],
                'endkey': [2015, 5, 26, 0, 0, 0]
            }
            # finalWfs = wfDBReader.getRequestByCouchView("bydate", options, date_range)
            tempWfs = wfDBReader.getRequestByCouchView("bydate", date_range)
            #print "Found %d wfs in status: %s" %(len(finalWfs), stat)
            finalWfs = []
            for wf, content in tempWfs.iteritems():
                if content['RequestStatus'] in statusList:
                    finalWfs.append(wf)
            print "Found %d wfs in not in active state" % len(finalWfs)
        else:
            finalWfs = [wfName]
            tempWfs = wfDBReader.getRequestByNames(wfName, True)
            print "Checking %s with status '%s'." % (
                wfName, tempWfs[wfName]['RequestStatus'])
        wqDocs, wqInboxDocs = [], []
        localWQDocs, localWQInboxDocs = [], []
        for counter, wf in enumerate(finalWfs):
            # progress marker for long workflow lists
            if counter % 100 == 0:
                print "%d wfs queried ..." % counter
            # check whether there are workqueue docs
            wqDocIDs = wqBackend.getElements(WorkflowName=wf)
            if wqDocIDs:
                print "Found %d workqueue docs for %s, status %s" % (
                    len(wqDocIDs), wf, tempWfs[wf]['RequestStatus'])
                print wqDocIDs
                wqDocs.append(wqDocIDs)
            # check whether there are workqueue_inbox docs
            if wqInboxDB.documentExists(wf):
                print "Found workqueue_inbox doc for %s, status %s" % (
                    wf, tempWfs[wf]['RequestStatus'])
                # then retrieve the document
                wqInboxDoc = wqInboxDB.document(wf)
                wqInboxDocs.append(wqInboxDoc)
            # check local queue
            wqDocIDs = localWQBackend.getElements(WorkflowName=wf)
            if wqDocIDs:
                print "Found %d local workqueue docs for %s, status %s" % (
                    len(wqDocIDs), wf, tempWfs[wf]['RequestStatus'])
                print wqDocIDs
                localWQDocs.append(wqDocIDs)
            if localWQInboxDB.documentExists(wf):
                print "Found local workqueue_inbox doc for %s, status %s" % (
                    wf, tempWfs[wf]['RequestStatus'])
                wqInboxDoc = localWQInboxDB.document(wf)
                print wqInboxDoc
                localWQInboxDocs.append(wqInboxDoc)
    # TODO TODO TODO for the moment only deletes for a specific workflow
    if wfName:
        var = raw_input("\nCan we delete all these documents (Y/N)? ")
        if var == "Y":
            # deletes workqueue_inbox doc
            # NOTE(review): wqInboxDoc/wqDocIDs hold whatever the last loop
            # iteration left behind and may be unbound if nothing was found
            if wqInboxDoc:
                print "Deleting workqueue_inbox id %s and %s" % (
                    wqInboxDoc['_id'], wqInboxDoc['_rev'])
                wqInboxDB.delete_doc(wqInboxDoc['_id'], wqInboxDoc['_rev'])
            # deletes workqueue docs
            if wqDocIDs:
                print "Deleting workqueue docs %s" % wqDocIDs
                wqBackend.deleteElements(
                    *[x for x in wqDocIDs if x['RequestName'] in wfName])
        else:
            print "You are the boss, aborting it ...\n"
class WorkQueue(WorkQueueBase): """ _WorkQueue_ WorkQueue object - interface to WorkQueue functionality. """ def __init__(self, logger = None, dbi = None, **params): WorkQueueBase.__init__(self, logger, dbi) self.parent_queue = None self.params = params # config argument (within params) shall be reference to # Configuration instance (will later be checked for presence of "Alert") self.config = params.get("Config", None) self.params.setdefault('CouchUrl', os.environ.get('COUCHURL')) if not self.params.get('CouchUrl'): raise RuntimeError, 'CouchUrl config value mandatory' self.params.setdefault('DbName', 'workqueue') self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox') self.params.setdefault('ParentQueueCouchUrl', None) # We get work from here self.backend = WorkQueueBackend(self.params['CouchUrl'], self.params['DbName'], self.params['InboxDbName'], self.params['ParentQueueCouchUrl'], self.params.get('QueueURL'), logger = self.logger) if self.params.get('ParentQueueCouchUrl'): self.parent_queue = WorkQueueBackend(self.params['ParentQueueCouchUrl'].rsplit('/', 1)[0], self.params['ParentQueueCouchUrl'].rsplit('/', 1)[1]) self.params.setdefault("GlobalDBS", "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet") self.params.setdefault('QueueDepth', 2) # when less than this locally self.params.setdefault('LocationRefreshInterval', 600) self.params.setdefault('FullLocationRefreshInterval', 7200) self.params.setdefault('TrackLocationOrSubscription', 'subscription') self.params.setdefault('ReleaseIncompleteBlocks', False) self.params.setdefault('ReleaseRequireSubscribed', True) self.params.setdefault('PhEDExEndpoint', None) self.params.setdefault('PopulateFilesets', True) self.params.setdefault('LocalQueueFlag', True) self.params.setdefault('JobDumpConfig', None) self.params.setdefault('BossAirConfig', None) self.params['QueueURL'] = self.backend.queueUrl # url this queue is visible on # backend took previous QueueURL and sanitized it 
self.params.setdefault('WMBSUrl', None) # this will only be set on local Queue self.params.setdefault('Teams', ['']) self.params.setdefault('DrainMode', False) if self.params.get('CacheDir'): try: os.makedirs(self.params['CacheDir']) except OSError: pass elif self.params.get('PopulateFilesets'): raise RuntimeError, 'CacheDir mandatory for local queue' self.params.setdefault('SplittingMapping', {}) self.params['SplittingMapping'].setdefault('DatasetBlock', {'name': 'Block', 'args': {}} ) self.params['SplittingMapping'].setdefault('MonteCarlo', {'name': 'MonteCarlo', 'args':{}} ) self.params['SplittingMapping'].setdefault('Dataset', {'name': 'Dataset', 'args': {}} ) self.params['SplittingMapping'].setdefault('Block', {'name': 'Block', 'args': {}} ) self.params['SplittingMapping'].setdefault('ResubmitBlock', {'name': 'ResubmitBlock', 'args': {}} ) self.params.setdefault('EndPolicySettings', {}) assert(self.params['TrackLocationOrSubscription'] in ('subscription', 'location')) # Can only release blocks on location if self.params['TrackLocationOrSubscription'] == 'location': if self.params['SplittingMapping']['DatasetBlock']['name'] != 'Block': raise RuntimeError, 'Only blocks can be released on location' if self.params.get('PhEDEx'): self.phedexService = self.params['PhEDEx'] else: phedexArgs = {} if self.params.get('PhEDExEndpoint'): phedexArgs['endpoint'] = self.params['PhEDExEndpoint'] self.phedexService = PhEDEx(phedexArgs) if self.params.get('SiteDB'): self.SiteDB = self.params['SiteDB'] else: self.SiteDB = SiteDB() if type(self.params['Teams']) in types.StringTypes: self.params['Teams'] = [x.strip() for x in \ self.params['Teams'].split(',')] self.dataLocationMapper = WorkQueueDataLocationMapper(self.logger, self.backend, phedex = self.phedexService, sitedb = self.SiteDB, locationFrom = self.params['TrackLocationOrSubscription'], incompleteBlocks = self.params['ReleaseIncompleteBlocks'], requireBlocksSubscribed = not self.params['ReleaseIncompleteBlocks'], 
fullRefreshInterval = self.params['FullLocationRefreshInterval'],
                                                updateIntervalCoarseness = self.params['LocationRefreshInterval'])
        # initialize alerts sending client (self.sendAlert() method)
        # usage: self.sendAlert(levelNum, msg = msg) ; level - integer 1 .. 10
        # 1 - 4 - lower levels ; 5 - 10 higher levels
        preAlert, self.alertSender = \
            alertAPI.setUpAlertsMessaging(self, compName = "WorkQueueManager")
        self.sendAlert = alertAPI.getSendAlert(sender = self.alertSender,
                                               preAlert = preAlert)
        self.logger.debug("WorkQueue created successfully")

    def __len__(self):
        """Returns number of Available elements in queue"""
        return self.backend.queueLength()

    def __del__(self):
        """
        Unregister itself with Alert Receiver.
        The registration happened in the constructor when initializing.
        """
        if self.alertSender:
            self.alertSender.unregister()

    def setStatus(self, status, elementIDs = None, SubscriptionId = None, WorkflowName = None):
        """
        _setStatus_, throws an exception if no elements are updated

        :param status: new Status value to set on the matched elements
        :param elementIDs: single id or list of element ids to update
        :param SubscriptionId: optional filter on the element's SubscriptionId
        :param WorkflowName: optional filter on the element's RequestName
        :raises WorkQueueNoMatchingElements: when no element matches the filters
        """
        try:
            if not elementIDs:
                elementIDs = []
            iter(elementIDs)
            # a bare string is iterable but must be treated as a single id,
            # so force it into the except branch below
            if type(elementIDs) in types.StringTypes:
                raise TypeError
        except TypeError:
            elementIDs = [elementIDs]

        if status == 'Canceled': # Cancel needs special actions
            return self.cancelWork(elementIDs, SubscriptionId, WorkflowName)

        args = {}
        if SubscriptionId:
            args['SubscriptionId'] = SubscriptionId
        if WorkflowName:
            args['RequestName'] = WorkflowName

        affected = self.backend.getElements(elementIDs = elementIDs, **args)
        if not affected:
            raise WorkQueueNoMatchingElements, "No matching elements"

        for x in affected:
            x['Status'] = status
        elements = self.backend.saveElements(*affected)
        return elements

    def setPriority(self, newpriority, *workflowNames):
        """
        Update priority for a workflow, throw exception if no elements affected

        :param newpriority: the Priority value to apply
        :param workflowNames: one or more request names whose elements are updated
        :raises RuntimeError: when no elements matched any of the workflows
        """
        self.logger.info("Priority change request to %s for %s" % (newpriority, str(workflowNames)))
        affected = []
        for wf in workflowNames:
            affected.extend(self.backend.getElements(returnIdOnly = True, RequestName = wf))

        self.backend.updateElements(*affected, Priority = newpriority)

        if not affected:
            raise RuntimeError, "Priority not changed: No matching elements"

    def resetWork(self, ids):
        """Put work back in Available state, from here either another queue
        or wmbs can pick it up.

        If work was Acquired by a child queue, the next status update will
        cancel the work in the child.

        Note: That the same child queue is free to pick the work up again,
        there is no permanent blacklist of queues.
        """
        self.logger.info("Resetting elements %s" % str(ids))
        # accept a single id as well as a list of ids
        try:
            iter(ids)
        except TypeError:
            ids = [ids]

        return self.backend.updateElements(*ids, Status = 'Available',
                                           ChildQueueUrl = None, WMBSUrl = None)

    def getWork(self, siteJobs):
        """
        Get available work from the queue, inject into wmbs & mark as running

        siteJob is dict format of {site: estimateJobSlot}
        of the resources to get work for.

        :returns: list of matched elements (with Subscription set when
                  filesets were populated)
        """
        results = []
        if not self.backend.isAvailable():
            self.logger.warning('Backend busy or down: skipping fetching of work')
            return results
        matches, _ = self.backend.availableWork(siteJobs)

        if not matches:
            return results

        # cache wmspecs for lifetime of function call, likely we will have multiple elements for same spec.
        #TODO: Check to see if we can skip spec loading - need to persist some more details to element
        wmspecCache = {}
        for match in matches:
            blockName, dbsBlock = None, None
            if self.params['PopulateFilesets']:
                if not wmspecCache.has_key(match['RequestName']):
                    wmspec = self.backend.getWMSpec(match['RequestName'])
                    wmspecCache[match['RequestName']] = wmspec
                else:
                    wmspec = wmspecCache[match['RequestName']]
                if match['Inputs']:
                    blockName, dbsBlock = self._getDBSBlock(match, wmspec)
                match['Subscription'] = self._wmbsPreparation(match, wmspec, blockName, dbsBlock)
            results.append(match)
        del wmspecCache # remove cache explicitly
        self.logger.info('Injected %s units into WMBS' % len(results))
        return results

    def _getDBSBlock(self, match, wmspec):
        """Get DBS info for this block

        For ACDC (resubmission) elements the file list comes from the ACDC
        data collection service; otherwise it is looked up in DBS, with
        parents when the task requires parent processing.

        :returns: tuple of (blockName, block info dict)
        """
        blockName = match['Inputs'].keys()[0] #TODO: Allow more than one

        if match['ACDC']:
            acdcInfo = match['ACDC']
            acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
            collection = acdc.getDataCollection(acdcInfo['collection'])
            splitedBlockName = ACDCBlock.splitBlockName(blockName)
            fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                           acdcInfo['fileset'],
                                           splitedBlockName['Offset'],
                                           splitedBlockName['NumOfFiles'],
                                           user = wmspec.getOwner().get("name"),
                                           group = wmspec.getOwner().get("group"))
            block = {}
            block["Files"] = fileLists
            return blockName, block
        else:
            dbs = get_dbs(match['Dbs'])
            if wmspec.getTask(match['TaskName']).parentProcessingFlag():
                dbsBlockDict = dbs.getFileBlockWithParents(blockName)
            else:
                dbsBlockDict = dbs.getFileBlock(blockName)

        return blockName, dbsBlockDict[blockName]

    def _wmbsPreparation(self, match, wmspec, blockName, dbsBlock):
        """Inject data into wmbs and create subscription.

        Saves the element back with its SubscriptionId and Status 'Running'.

        :returns: the created wmbs subscription
        """
        # local import: WMBSHelper pulls in wmbs/database machinery that is
        # only needed (and available) when filesets are being populated
        from WMCore.WorkQueue.WMBSHelper import WMBSHelper
        self.logger.info("Adding WMBS subscription for %s" % match['RequestName'])

        mask = match['Mask']
        wmbsHelper = WMBSHelper(wmspec, blockName, mask, self.params['CacheDir'])

        sub, match['NumOfFilesAdded'] = wmbsHelper.createSubscriptionAndAddFiles(block = dbsBlock)

        self.logger.info("Created top level subscription %s for %s with %s files" % (sub['id'],
                                                                                     match['RequestName'],
                                                                                     match['NumOfFilesAdded']))

        match['SubscriptionId'] = sub['id']
        match['Status'] = 'Running'
        self.backend.saveElements(match)

        return sub

    def _assignToChildQueue(self, queue, *elements):
        """Assign work from parent to queue

        Marks each element Negotiating and records the child/parent queue and
        WMBS urls before saving to the parent queue.
        """
        for ele in elements:
            ele['Status'] = 'Negotiating'
            ele['ChildQueueUrl'] = queue
            ele['ParentQueueUrl'] = self.params['ParentQueueCouchUrl']
            ele['WMBSUrl'] = self.params["WMBSUrl"]
        work = self.parent_queue.saveElements(*elements)
        requests = ', '.join(list(set(['"%s"' % x['RequestName'] for x in work])))
        self.logger.info('Acquired work for request(s): %s' % requests)
        return work

    def doneWork(self, elementIDs = None, SubscriptionId = None, WorkflowName = None):
        """Mark work as done
        """
        return self.setStatus('Done', elementIDs = elementIDs,
                              SubscriptionId = SubscriptionId,
                              WorkflowName = WorkflowName)

    def cancelWork(self, elementIDs = None, SubscriptionId = None, WorkflowName = None, elements = None):
        """Cancel work - delete in wmbs, delete from workqueue db, set canceled in inbox
        Elements may be directly provided or determined from series of filter arguments

        :returns: list of ids of the deleted elements
        :raises RuntimeError: when a canceled workflow has no inbox (parent) element
        """
        if not elements:
            args = {}
            if SubscriptionId:
                args['SubscriptionId'] = SubscriptionId
            if WorkflowName:
                args['RequestName'] = WorkflowName
            elements = self.backend.getElements(elementIDs = elementIDs, **args)

        # only cancel in global if work has not been passed to a child queue
        if not self.params['LocalQueueFlag']:
            elements = [x for x in elements if not x['ChildQueueUrl']]

        requestNames = set([x['RequestName'] for x in elements])
        if not requestNames:
            return []
        # if we can talk to wmbs kill the jobs
        if self.params['PopulateFilesets']:
            from WMCore.WorkQueue.WMBSHelper import killWorkflow
            self.logger.debug("""Canceling work in wmbs, workflows: %s""" % (requestNames))
            for workflow in requestNames:
                try:
                    # killWorkflow expects dbi/logger on the current thread
                    myThread = threading.currentThread()
                    myThread.dbi = self.conn.dbi
                    myThread.logger = self.logger
                    killWorkflow(workflow, self.params["JobDumpConfig"], self.params["BossAirConfig"])
                except RuntimeError:
                    #TODO: Check this logic and improve if possible
                    if SubscriptionId:
                        self.logger.info("""Cancel update: Only some subscription's canceled. This might be due to a child subscriptions: %s""" % elementIDs)

        # update parent elements to canceled
        for wf in requestNames:
            inbox_elements = self.backend.getInboxElements(WorkflowName = wf, returnIdOnly = True)
            if not inbox_elements:
                raise RuntimeError, "Cant find parent for %s" % wf
            self.backend.updateInboxElements(*inbox_elements, Status = 'Canceled')

        # delete elements - no longer need them
        self.backend.deleteElements(*elements)

        return [x.id for x in elements]

    def deleteWorkflows(self, *requests):
        """Delete requests if finished

        Each request's inbox element must exist and be in an end state;
        otherwise the request is left untouched and an error is logged.
        """
        for request in requests:
            request = self.backend.getInboxElements(elementIDs = [request])
            if len(request) != 1:
                raise RuntimeError, 'Invalid number of requests for %s' % request[0]['RequestName']
            request = request[0]

            if request.inEndState():
                self.logger.info('Deleting request "%s" as it is %s' % (request.id, request['Status']))
                self.backend.deleteElements(request)
            else:
                self.logger.error('Not deleting "%s" as it is %s' % (request.id, request['Status']))

    def queueWork(self, wmspecUrl, request = None, team = None):
        """
        Take and queue work from a WMSpec.

        If request name is provided but doesn't match WMSpec name
        an error is raised.

        If team is provided work will only be available to queue's
        belonging to that team.

        Duplicate specs will be ignored.

        :returns: 1 for a duplicate spec, otherwise the number of elements queued
        :raises WorkQueueWMSpecError: on request name validation failure or
                request/workflow name mismatch
        """
        self.logger.info('queueWork() begin queueing "%s"' % wmspecUrl)
        wmspec = WMWorkloadHelper()
        wmspec.load(wmspecUrl)

        # check we haven't already got this work
        try:
            self.backend.getInboxElements(elementIDs = [wmspec.name()])
        except CouchNotFoundError:
            pass
        else:
            self.logger.warning('queueWork(): Ignoring duplicate spec "%s"' % wmspec.name())
            return 1

        if request:
            try:
                Lexicon.requestName(request)
            except Exception, ex: # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(wmspec, "Request name validation error: %s" % str(ex))
                raise error
            if request != wmspec.name():
                raise WorkQueueWMSpecError(wmspec, 'Request & workflow name mismatch %s vs %s' % (request, wmspec.name()))

        # Do splitting before we save inbound work to verify the wmspec
        # if the spec fails it won't enter the queue
        inbound = self.backend.createWork(wmspec, TeamName = team, WMBSUrl = self.params["WMBSUrl"])

        # either we have already split the work or we do that now
        work = self.backend.getElementsForWorkflow(wmspec.name())
        if work:
            self.logger.info('Request "%s" already split - Resuming' % str(wmspec.name()))
        else:
            work = self._splitWork(wmspec, None, inbound['Inputs'], inbound['Mask'])
            self.backend.insertElements(work, parent = inbound) # if this fails, rerunning will pick up here
        self.backend.insertElements([inbound]) # save inbound work to signal we have completed queueing

        return len(work)
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics
    """
    def __init__(self, config):
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()
        # condor job states we report on; keys in the results are lowercased
        self.condorStates = ("Running", "Idle")

    def collectDrainInfo(self):
        """
        Call methods to check the drain status

        :returns: dict with 'workflows_completed' plus, when all workflows are
                  completed, upload/condor/local/global workqueue statistics
        """
        results = {}
        results['workflows_completed'] = self.checkWorkflows()

        # if workflows are completed, collect additional drain statistics
        if results['workflows_completed']:
            results['upload_status'] = self.checkFileUploadStatus()
            results['condor_status'] = self.checkCondorStates()
            results['local_wq_status'] = self.checkLocalWQStatus(dbname="workqueue")
            results['local_wqinbox_status'] = self.checkLocalWQStatus(dbname="workqueue_inbox")
            results['global_wq_status'] = self.checkGlobalWQStatus()

        return results

    def checkWorkflows(self):
        """
        Check to see if all workflows have a 'completed' status

        :returns: result of DBSBufferUtil.isAllWorkflowCompleted()
        """
        results = self.dbsUtil.isAllWorkflowCompleted()
        return results

    def checkCondorStates(self):
        """
        Check idle and running jobs in Condor

        :returns: dict mapping lowercased state name to an int count, or to
                  None when the condor query failed or did not report the state
        """
        results = {}
        jobs = self.condorAPI.getCondorJobsSummary()
        for state in self.condorStates:
            # if there is an error, report it instead of the length of an empty list
            if not jobs:
                results[state.lower()] = None
            else:
                # a summary record may lack a state key; int(None) would raise
                # TypeError, so report None (the error marker) instead
                stateCount = jobs[0].get(state)
                results[state.lower()] = int(stateCount) if stateCount is not None else None

        return results

    def checkFileUploadStatus(self):
        """
        Check file upload status:
          Blocks open in DBS
          Files not uploaded in DBS
          Files not uploaded to Phedex

        :returns: dict of counts keyed by 'dbs_open_blocks',
                  'dbs_notuploaded' and 'phedex_notuploaded'
        """
        results = {}
        results['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        results['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(status="NOTUPLOADED")
        results['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return results

    def checkLocalWQStatus(self, dbname):
        """
        Query local WorkQueue workqueue/workqueue_inbox database to see whether
        there are any active elements in this agent.

        :param dbname: "workqueue" queries elements, anything else queries
                       inbox elements
        :returns: dict mapping each active status to its element count
        """
        results = {}

        for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
            if dbname == "workqueue":
                elements = self.localBackend.getElements(status=st, returnIdOnly=True)
            else:
                elements = self.localBackend.getInboxElements(status=st, returnIdOnly=True)
            results[st] = len(elements)
        return results

    def checkGlobalWQStatus(self):
        """
        Query Global WorkQueue workqueue database to see whether there are
        any active elements set to this agent.

        :returns: dict mapping each active status to the count of global
                  elements assigned to this agent's queue url
        """
        results = {}

        for st in ("Acquired", "Running"):
            elements = self.globalBackend.getElements(status=st, returnIdOnly=True,
                                                      ChildQueueUrl=self.thisAgentUrl)
            results[st] = len(elements)
        return results