def main():
    """
    _main_

    Attach the CERN location to every available file that has no location.

    Runs the ``getFilesAvailable`` query, de-duplicates the returned file ids,
    looks up the CERN location row with ``getCERNLocation`` and passes the
    merged binds to ``updateFileLocation``.

    Exits the interpreter with status 0 on success, or 1 when the CERN
    location lookup returns nothing.
    """
    os.environ.setdefault('WMAGENT_CONFIG', '/data/srv/wmagent/current/config/wmagent/config.py')
    os.environ.setdefault('manage', '/data/srv/wmagent/current/config/wmagent/manage')

    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    # Get all the files available for each subscription
    print("Getting files available without location...")
    availFiles = formatter.formatDict(myThread.dbi.processData(getFilesAvailable))
    print("Total files available: %s" % len(availFiles))
    uniqAvailFiles = list(set(x['fileid'] for x in availFiles))
    availFiles = [{'fileid': x} for x in uniqAvailFiles]
    print("Total unique files available: %s" % len(uniqAvailFiles))

    # Indexing the lookup directly would raise IndexError on an empty result,
    # so the "missing location" message below could never be reached.
    cernRows = formatter.formatDict(myThread.dbi.processData(getCERNLocation))
    cernID = cernRows[0] if cernRows else None
    print("CERN location id: %s" % cernID)
    if not cernID:
        print("You need to add T0_CH_CERN to the resource control db")
        sys.exit(1)

    for fid in availFiles:
        fid.update(cernID)

    # NOTE(review): an empty bind list presumably makes processData run the
    # statement with unbound placeholders - skip the call when there is
    # nothing to update.
    if availFiles:
        myThread.dbi.processData(updateFileLocation, availFiles)

    print("Done!")
    sys.exit(0)
def main():
    """
    Time out BossAir jobs that have been active for more than 5 days.

    After ``condorCleanup()`` has run, the ``getRunJobsActive`` query is
    executed with a timestamp five days in the past and the first 10000
    returned rows are used as binds for ``updateState``. The process always
    exits with status 0.
    """
    defaults = (
        ('WMAGENT_CONFIG', '/data/srv/wmagent/current/config/wmagent/config.py'),
        ('manage', '/data/srv/wmagent/current/config/wmagent/manage'),
    )
    for envName, envValue in defaults:
        if envName not in os.environ:
            os.environ[envName] = envValue

    # first, break free from old condor jobs
    condorCleanup()

    connectToDB()
    currThread = threading.currentThread()
    dbFormatter = DBFormatter(logging, currThread.dbi)

    fiveDaysAgo = int(time.time()) - 5 * 24 * 3600
    rawRows = currThread.dbi.processData(getRunJobsActive, [{'timestamp': fiveDaysAgo}])
    staleJobs = dbFormatter.formatDict(rawRows)
    print("Found %d active jobs in BossAir older than 5 days" % len(staleJobs))

    # now mark these jobs as complete and in Timeout status
    # (only the first 10000 rows are updated in a single run)
    currThread.dbi.processData(updateState, staleJobs[:10000])

    print("Done!")
    sys.exit(0)
def fixDBSmissingFileAssoc():
    """
    Insert the missing ``dbsbuffer_file_location`` rows for DBSBuffer files.

    Selects (fileid, seid) pairs for every ``dbsbuffer_file`` that has no row
    in ``dbsbuffer_file_location``, resolving the location through the WMBS
    file location and its SE name. Only the first pair found for each file id
    is kept, and those pairs are inserted into ``dbsbuffer_file_location``.

    ``WMAGENT_CONFIG`` is overwritten unconditionally with the default agent
    configuration path.
    """
    os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    problemFilesSql = """
        select dbsbuffer_file.id as fileid, dbsbuffer_location.id as seid
        from wmbs_file_location fl
            inner join wmbs_file_details fd on fd.id = fl.fileid
            inner join wmbs_location_senames wls on wls.location = fl.location
            inner join wmbs_location wl on wl.id = fl.location
            inner join dbsbuffer_location on dbsbuffer_location.se_name = wls.se_name
            inner join dbsbuffer_file on dbsbuffer_file.lfn = fd.lfn
        where fd.lfn in (select df.lfn from dbsbuffer_file df
                         left outer join dbsbuffer_file_location dfl on df.id = dfl.filename
                         where dfl.location is null)
        """
    unfinishedTasks = formatter.formatDict(formatter.dbi.processData(problemFilesSql))
    print("%s lenth" % len(unfinishedTasks))

    # Keep a single (fileid, seid) pair per file: the first one returned wins.
    result = {}
    for row in unfinishedTasks:
        result.setdefault(row["fileid"], row)
        print(row)
    print("trimed %s lenth" % len(result))

    if not result:
        # NOTE(review): with no binds the INSERT would presumably be executed
        # with unbound placeholders - nothing to insert, so stop here.
        return

    insertSQL = """INSERT INTO dbsbuffer_file_location (filename, location)
                   VALUES (:fileid, :seid)"""
    # Pass a real list: dict.values() is a view object on Python 3.
    done = formatter.dbi.processData(insertSQL, list(result.values()))
    print("inserted %s" % done)
def fixDBSmissingFileAssoc():
    """
    Insert the missing ``dbsbuffer_file_location`` rows for DBSBuffer files.

    Selects (fileid, seid) pairs for every ``dbsbuffer_file`` that has no row
    in ``dbsbuffer_file_location``, resolving the location through the WMBS
    file location and its PNN. Only the first pair found for each file id is
    kept, and those pairs are inserted into ``dbsbuffer_file_location``.

    ``WMAGENT_CONFIG`` is overwritten unconditionally with the default agent
    configuration path.
    """
    os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    problemFilesSql = """
        select dbsbuffer_file.id as fileid, dbsbuffer_location.id as seid
        from wmbs_file_location fl
            inner join wmbs_file_details fd on fd.id = fl.fileid
            inner join wmbs_location_pnns wls on wls.location = fl.location
            inner join wmbs_location wl on wl.id = fl.location
            inner join dbsbuffer_location on dbsbuffer_location.pnn = wls.pnn
            inner join dbsbuffer_file on dbsbuffer_file.lfn = fd.lfn
        where fd.lfn in (select df.lfn from dbsbuffer_file df
                         left outer join dbsbuffer_file_location dfl on df.id = dfl.filename
                         where dfl.location is null)
        """
    unfinishedTasks = formatter.formatDict(formatter.dbi.processData(problemFilesSql))
    print("%s lenth" % len(unfinishedTasks))

    # Keep a single (fileid, seid) pair per file: the first one returned wins.
    result = {}
    for row in unfinishedTasks:
        result.setdefault(row["fileid"], row)
        print(row)
    print("trimed %s lenth" % len(result))

    if not result:
        # NOTE(review): with no binds the INSERT would presumably be executed
        # with unbound placeholders - nothing to insert, so stop here.
        return

    insertSQL = """INSERT INTO dbsbuffer_file_location (filename, location)
                   VALUES (:fileid, :seid)"""
    # Pass a real list: dict.values() is a view object on Python 3.
    done = formatter.dbi.processData(insertSQL, list(result.values()))
    print("inserted %s" % done)
def setup(self): """ _setup_ Setup the environment, the database connection and retrieve the input. """ if "WMAGENT_CONFIG" not in os.environ: if self.configFilePath is not None: os.environ["WMAGENT_CONFIG"] = self.configFilePath else: raise RuntimeError("Config path option or the WMAGENT_CONFIG environment variable must be specified") try: connectToDB() except: logging.error("Failed to connect to the Database") raise inputDataFile = None try: inputDataFile = open(self.inputDataFilePath, 'r') self.inputData = json.load(inputDataFile) except: logging.error("Failed to load the input file with the information") raise finally: if inputDataFile is not None: inputDataFile.close()
def main():
    """
    Interactively reset WMBS jobs that have been executing for over 6 days.

    For every workflow returned by ``getJobsExecuting`` the request status is
    fetched with ``getStatus`` and the matching condor jobs are collected from
    the schedd. Workflows that still have condor jobs, or whose status is
    'UNKNOWN', are skipped. For the others the operator is asked to confirm
    before the old WMBS jobs are moved to 'jobfailed' (request still
    acquired/running) or 'cleanout' (any other status) through ``updateState``.
    The process always exits with status 0.
    """
    os.environ.setdefault('WMAGENT_CONFIG', '/data/srv/wmagent/current/config/wmagent/config.py')
    os.environ.setdefault('manage', '/data/srv/wmagent/current/config/wmagent/manage')

    sixDaysAgo = int(time.time()) - 6 * 24 * 3600

    connectToDB()
    currThread = threading.currentThread()
    dbFormatter = DBFormatter(logging, currThread.dbi)

    # Get list of workflows and number of jobs executing for more than 6 days
    rawRows = currThread.dbi.processData(getJobsExecuting, [{'timestamp': sixDaysAgo}])
    wmbsJobsPerWf = dbFormatter.formatDict(rawRows)
    totalJobs = sum(int(entry['count']) for entry in wmbsJobsPerWf)
    print("Found %d workflows with a total of %d jobs" % (len(wmbsJobsPerWf), totalJobs))

    # Retrieve all jobs from condor schedd. xquery returns an iterator, so it
    # is turned into a list that can be walked once per workflow.
    schedd = condor.Schedd()
    wantedAttrs = ['ClusterID', 'ProcId', 'WMAgent_RequestName', 'JobStatus', 'WMAgent_JobID']
    condorJobs = list(schedd.xquery('true', wantedAttrs))

    # Retrieve the request status from reqmgr2 and attach the wmbs ids of the
    # condor jobs that belong to each workflow
    for wf in wmbsJobsPerWf:
        wf['status'] = getStatus(wf['name'])
        wf['condorjobs'] = [job['WMAgent_JobID'] for job in condorJobs
                            if job['WMAgent_RequestName'] == wf['name']]

    # time to have some ACTION
    activeStates = ('acquired', 'running-open', 'running-closed')
    for wf in wmbsJobsPerWf:
        wfBinds = [{'timestamp': sixDaysAgo, 'wfname': wf['name']}]
        jobIds = dbFormatter.formatDict(currThread.dbi.processData(getWMBSIds, wfBinds))
        wmbsIds = [row['id'] for row in jobIds]
        print("%-100s in %s. Has %d wmbs and %d condor jobs" % (wf['name'], wf['status'],
                                                                len(wmbsIds), len(wf['condorjobs'])))

        # Just skip it if there are condor jobs out there
        if wf['condorjobs'] or wf['status'] == 'UNKNOWN':
            continue

        if wf['status'] in activeStates:
            newstatus = 'jobfailed'
        else:
            newstatus = 'cleanout'

        answer = raw_input("Marking jobs from %s to %s: (Y/N) " % (wf['status'], newstatus))
        if answer in ['Y', 'y']:
            print("UPDATED %s" % wf['name'])
            for row in jobIds:
                row['state'] = newstatus
            currThread.dbi.processData(updateState, list(jobIds))

    print("Done!")
    sys.exit(0)
def checkJobCountsAgent(requestName):
    """
    Print a diagnosis of the unfinished work of a workflow in this agent.

    Check #1 counts the WMBS jobs of unfinished subscriptions per task and
    job state (jobs in 'cleanout' are ignored); when any are found they are
    printed and the function returns. Otherwise the subscriptions of the
    workflow are counted and a hint is printed depending on whether some of
    them are still unfinished.

    :param requestName: workflow name, matched against wmbs_workflow.name
    :return: None

    ``WMAGENT_CONFIG`` is overwritten unconditionally with the default agent
    configuration path.
    """
    os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    # The workflow name is passed as a bind variable instead of being
    # interpolated into the SQL text.
    wfBinds = {'name': requestName}

    unfinishedTasks = formatter.formatDict(myThread.dbi.processData(
        """SELECT wmbs_workflow.task, wmbs_job_state.name,
                  COUNT(wmbs_job.id) AS jobcount
           FROM wmbs_workflow
           INNER JOIN wmbs_subscription ON
               wmbs_subscription.workflow = wmbs_workflow.id
           INNER JOIN wmbs_jobgroup ON
               wmbs_jobgroup.subscription = wmbs_subscription.id
           INNER JOIN wmbs_job ON
               wmbs_job.jobgroup = wmbs_jobgroup.id
           INNER JOIN wmbs_job_state ON
               wmbs_job.state = wmbs_job_state.id
           WHERE wmbs_workflow.name = :name AND
                 wmbs_subscription.finished = 0 AND
                 wmbs_job_state.name != 'cleanout'
           GROUP BY wmbs_workflow.task, wmbs_job_state.name""", wfBinds))

    # task -> {job state -> job count}
    result = {}
    for row in unfinishedTasks:
        result.setdefault(row['task'], {})[row['name']] = row['jobcount']

    for task in result:
        msg = "Task %s has " % task
        for state in result[task]:
            msg += '%d jobs %s ' % (result[task][state], state)
        print(msg)

    if result:
        return
    print("Check #1 failed, there are no unfinished tasks in the system apparently.")

    unfinishedSubs = formatter.formatDict(myThread.dbi.processData(
        """SELECT wmbs_subscription.id, wmbs_workflow.task
           FROM wmbs_workflow
           INNER JOIN wmbs_subscription ON
               wmbs_subscription.workflow = wmbs_workflow.id
           WHERE wmbs_workflow.name = :name AND
                 wmbs_subscription.finished = 0""", wfBinds))
    totalSubs = formatter.formatDict(myThread.dbi.processData(
        """SELECT wmbs_subscription.id, wmbs_workflow.task
           FROM wmbs_workflow
           INNER JOIN wmbs_subscription ON
               wmbs_subscription.workflow = wmbs_workflow.id
           WHERE wmbs_workflow.name = :name""", wfBinds))
    print("There are %d subscriptions for this workflow, %d are incomplete."
          % (len(totalSubs), len(unfinishedSubs)))

    # The two hints are mutually exclusive: the "all finished" message must
    # not follow the report of unfinished subscriptions.
    if unfinishedSubs:
        print("It appears no jobs have been created for some unfinished subscriptions, check the health of the JobCreator or contact a developer.")
    else:
        print("This workflow has all subscriptions as finished, the TaskArchiver should be eating through it now. This can take time though.")
    return
def __init__(self):
    """
    Connect to the database and prepare the DAO used by this object.

    Stores the database interface of the current thread in ``self.dbi``,
    builds a DAOFactory for the ``WMComponent.RucioInjector.Database``
    package and instantiates its ``GetUnsubscribedDatasets`` DAO as
    ``self.getUnsubscribedDsets``.
    """
    # Connecting to DB
    currThread = threading.currentThread()
    connectToDB()
    self.dbi = currThread.dbi

    # Creating DAO stuff for job discovery
    factoryArgs = {
        "package": "WMComponent.RucioInjector.Database",
        "logger": currThread.logger,
        "dbinterface": self.dbi,
    }
    self.daoFactory = DAOFactory(**factoryArgs)
    self.getUnsubscribedDsets = self.daoFactory(classname="GetUnsubscribedDatasets")
def main():
    """
    _main_

    Flag buffer files that PhEDEx already knows about.

    The module-level ``query`` returns (lfn, block) pairs. Every block is
    looked up in PhEDEx with the 'data' call, and each lfn reported back for
    its block is collected. The collected lfns are then used as binds for the
    module-level ``modification`` statement.

    :return: 0 in all cases
    """
    # Start services
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    connectToDB()
    phedexApi = PhEDEx()
    currThread = threading.currentThread()
    print("Please remember to shutdown the PhEDExInjector first, you have 10 seconds before the script starts.")
    time.sleep(10)

    # Get the files that the PhEDExInjector would look for
    formatter = DBFormatter(logging, currThread.dbi)
    formatter.sql = query
    lfnsByBlock = defaultdict(set)
    for lfn, block in formatter.execute():
        lfnsByBlock[block].add(lfn)

    # Check with block-level calls
    foundFiles = set()
    for block, expectedLfns in lfnsByBlock.items():
        reply = phedexApi._getResult('data', args={'block': block}, verb='GET')
        for dbs in reply['phedex']['dbs']:
            for dataset in dbs['dataset']:
                for blockInfo in dataset['block']:
                    foundFiles.update(fileInfo['lfn'] for fileInfo in blockInfo['file']
                                      if fileInfo['lfn'] in expectedLfns)

    if not foundFiles:
        print("I didn't find an abnormal file, feel free to panic!. Please contact a developer.")
        return 0
    print("Found %d files that are already registered in PhEDEx but the buffer doesn't know" % len(foundFiles))
    print("Fixing them now...")

    # Fix it!
    binds = [{'lfn': lfn} for lfn in foundFiles]
    formatter.dbi.processData(modification, binds, conn=None, transaction=False, returnCursor=False)
    print("Fixed them! :)")
    print("You can restart the PhEDExInjector now, have a nice day!")
    return 0
def main():
    """
    Give a location to DBSBuffer files stuck in READY without a block.

    For every READY ``dbsbuffer_file`` that has no ``dbsbuffer_block`` the
    framework job report is fetched from the local CouchDB view
    ``FWJRDump/jobsByOutputLFN`` and the site of its ``cmsRun1`` step is
    read. The SE name of that site is inserted into ``dbsbuffer_location``
    (when not there yet), associated with the file in
    ``dbsbuffer_file_location``, and the file status is set to NOTUPLOADED.

    Files for which no job report, no ``cmsRun1`` step or no SE name can be
    found are reported and skipped.

    :return: None
    """
    if "WMAGENT_CONFIG" not in os.environ:
        os.environ["WMAGENT_CONFIG"] = '/data/srv/wmagent/current/config/wmagent/config.py'
    myThread = threading.currentThread()
    connectToDB()
    formatter = DBFormatter(logging, myThread.dbi)

    limboFiles = formatter.formatDict(myThread.dbi.processData(
        """SELECT dbsbuffer_workflow.name, dbsbuffer_file.lfn
           FROM dbsbuffer_file
           INNER JOIN dbsbuffer_workflow ON
               dbsbuffer_file.workflow = dbsbuffer_workflow.id
           LEFT OUTER JOIN dbsbuffer_block ON
               dbsbuffer_file.block_id = dbsbuffer_block.id
           WHERE dbsbuffer_file.status = 'READY' AND
                 dbsbuffer_block.id is NULL"""))
    if not limboFiles:
        print("There are no bad files to fix")
        return

    # All statements use bind variables instead of string interpolation.
    seQuery = """SELECT wmbs_location_senames.se_name
                 FROM wmbs_location_senames
                 INNER JOIN wmbs_location ON
                     wmbs_location.id = wmbs_location_senames.location
                 WHERE wmbs_location.site_name = :site"""
    insertLocation = """INSERT INTO dbsbuffer_location (se_name)
                        SELECT :se_name AS se_name FROM DUAL
                        WHERE NOT EXISTS (SELECT se_name FROM dbsbuffer_location
                                          WHERE se_name = :se_name)"""
    insertFileLocation = """INSERT INTO dbsbuffer_file_location (filename, location)
                            SELECT df.id, dl.id
                            FROM dbsbuffer_file df, dbsbuffer_location dl
                            WHERE df.lfn = :lfn AND dl.se_name = :se_name"""
    updateStatus = """UPDATE dbsbuffer_file SET status = 'NOTUPLOADED'
                      WHERE lfn = :lfn"""

    # The couch handle does not depend on the file, so build it only once.
    fwjrDB = Database('wmagent_jobdump/fwjrs', 'http://%s:5984' % socket.gethostname())

    for entry in limboFiles:
        rows = fwjrDB.loadView('FWJRDump', 'jobsByOutputLFN', {'include_docs': True},
                               [[entry['name'], entry['lfn']]])['rows']

        # Reset per file so a site found for a previous file is never reused.
        site = None
        if rows:
            steps = rows[0]['doc']['fwjr']['steps']
            if 'cmsRun1' in steps:
                site = steps['cmsRun1']['site']
        if site is None:
            print("Could not find location for %s" % entry['lfn'])
            continue

        seRows = formatter.formatDict(myThread.dbi.processData(seQuery, {'site': site}))
        if not seRows:
            # The site has no SE name registered in WMBS.
            print("Could not find location for %s" % entry['lfn'])
            continue
        seName = seRows[0]['se_name']

        myThread.dbi.processData(insertLocation, {'se_name': seName})
        myThread.dbi.processData(insertFileLocation, {'lfn': entry['lfn'], 'se_name': seName})
        myThread.dbi.processData(updateStatus, {'lfn': entry['lfn']})
def main():
    """
    _main_

    Mark files as known when PhEDEx already has them registered.

    Reads (lfn, block) pairs with the module-level ``query``, asks PhEDEx for
    the content of each block ('data' call) and keeps the lfns that PhEDEx
    reports for their own block. Those lfns become the binds of the
    module-level ``modification`` statement.

    :return: 0 in all cases
    """
    # Start services
    os.environ.setdefault('WMAGENT_CONFIG', '/data/srv/wmagent/current/config/wmagent/config.py')
    connectToDB()
    myPhEDEx = PhEDEx()
    thisThread = threading.currentThread()
    print("Please remember to shutdown the PhEDExInjector first, you have 10 seconds before the script starts.")
    time.sleep(10)

    # Get the files that the PhEDExInjector would look for
    dbFormatter = DBFormatter(logging, thisThread.dbi)
    dbFormatter.sql = query
    rows = dbFormatter.execute()

    blockToLfns = defaultdict(set)
    for lfn, blockName in rows:
        blockToLfns[blockName].add(lfn)

    # Check with block-level calls
    foundFiles = set()
    for blockName in blockToLfns:
        wanted = blockToLfns[blockName]
        answer = myPhEDEx._getResult('data', args={'block': blockName}, verb='GET')
        for dbsEntry in answer['phedex']['dbs']:
            for datasetEntry in dbsEntry['dataset']:
                for blockEntry in datasetEntry['block']:
                    for fileEntry in blockEntry['file']:
                        name = fileEntry['lfn']
                        if name in wanted:
                            foundFiles.add(name)

    if not foundFiles:
        print("I didn't find an abnormal file, feel free to panic!. Please contact a developer.")
        return 0

    print("Found %d files that are already registered in PhEDEx but the buffer doesn't know" % len(foundFiles))
    print("Fixing them now...")

    # Fix it!
    lfnBinds = [{'lfn': name} for name in foundFiles]
    dbFormatter.dbi.processData(modification, lfnBinds, conn=None,
                                transaction=False, returnCursor=False)

    print("Fixed them! :)")
    print("You can restart the PhEDExInjector now, have a nice day!")
    return 0
def killWorkflowAgent(WorkflowName):
    """
    Cancel work for a given workflow in this agent.

    Loads the agent configuration, looks up the workqueue elements of the
    workflow and calls ``killWorkflow`` for every request name found (the
    given name is always included, in case no elements exist yet because the
    work was still negotiating).

    NOTE(review): the original description also mentions deleting from the
    workqueue db and setting the inbox elements to canceled; this function
    only queries the workqueue backend and delegates to ``killWorkflow``.

    :param WorkflowName: name of the workflow (request) to cancel
    :return: None; failures of ``killWorkflow`` are printed, not raised
    """
    # get configuration file path
    if "WMAGENT_CONFIG" not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'

    # load config
    wmConfig = loadConfigurationFile(os.environ['WMAGENT_CONFIG'])
    wqManager = wmConfig.section_('WorkQueueManager')

    couchUrl = wqManager.couchurl
    dbname = wqManager.dbname
    inboxDatabase = wqManager.inboxDatabase
    parentQueueCouchUrl = wqManager.queueParams['ParentQueueCouchUrl']

    # Creates backend
    backend = WorkQueueBackend(couchUrl, dbname, inboxDatabase, parentQueueCouchUrl)

    elements = backend.getElements(RequestName=WorkflowName)

    # take wf from args in case no elements exist for workflow (i.e. work was negotiating)
    # The set therefore always holds at least WorkflowName, so no emptiness
    # check is needed.
    requestNames = set(x['RequestName'] for x in elements)
    requestNames.add(WorkflowName)

    # NOTE(review): the inbox elements are fetched but not used afterwards;
    # the calls are kept so the same backend queries are issued as before.
    inbox_elements = []
    for wf in requestNames:
        inbox_elements.extend(backend.getInboxElements(WorkflowName=wf))

    print("Canceling work for workflow: %s" % requestNames)
    for workflow in requestNames:
        try:
            connectToDB()
            # The same configuration object serves as both the JobDump and
            # the BossAir configuration.
            killWorkflow(workflow, wmConfig, wmConfig)
        except Exception as ex:
            print('Aborting %s wmbs subscription failed: %s' % (workflow, str(ex)))
def killWorkflowAgent(WorkflowName):
    """
    Cancel work for a given workflow in this agent.

    Loads the agent configuration, looks up the workqueue elements of the
    workflow and calls ``killWorkflow`` for every request name found (the
    given name is always included, in case no elements exist yet because the
    work was still negotiating).

    NOTE(review): the original description also mentions deleting from the
    workqueue db and setting the inbox elements to canceled; this function
    only queries the workqueue backend and delegates to ``killWorkflow``.

    :param WorkflowName: name of the workflow (request) to cancel
    :return: None; failures of ``killWorkflow`` are printed, not raised
    """
    # get configuration file path
    if "WMAGENT_CONFIG" not in os.environ:
        os.environ["WMAGENT_CONFIG"] = "/data/srv/wmagent/current/config/wmagent/config.py"

    # load config
    wmConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    wqManager = wmConfig.section_("WorkQueueManager")

    couchUrl = wqManager.couchurl
    dbname = wqManager.dbname
    inboxDatabase = wqManager.inboxDatabase
    parentQueueCouchUrl = wqManager.queueParams["ParentQueueCouchUrl"]

    # Creates backend
    backend = WorkQueueBackend(couchUrl, dbname, inboxDatabase, parentQueueCouchUrl)

    elements = backend.getElements(RequestName=WorkflowName)

    # take wf from args in case no elements exist for workflow (i.e. work was negotiating)
    # The set therefore always holds at least WorkflowName, so no emptiness
    # check is needed.
    requestNames = set(x["RequestName"] for x in elements)
    requestNames.add(WorkflowName)

    # NOTE(review): the inbox elements are fetched but not used afterwards;
    # the calls are kept so the same backend queries are issued as before.
    inbox_elements = []
    for wf in requestNames:
        inbox_elements.extend(backend.getInboxElements(WorkflowName=wf))

    print("Canceling work for workflow: %s" % requestNames)
    for workflow in requestNames:
        try:
            connectToDB()
            # The same configuration object serves as both the JobDump and
            # the BossAir configuration.
            killWorkflow(workflow, wmConfig, wmConfig)
        except Exception as ex:
            print("Aborting %s wmbs subscription failed: %s" % (workflow, str(ex)))
def main():
    """
    _main_

    Set the missing TaskName in the framework job report of the given jobs.

    The job ids are read from the ``-j/--jobId`` option as a
    whitespace-separated list. For each job the module-level ``getQuery``
    (with the job id appended) returns the report path and the task name;
    reports that already carry a task name are left untouched.

    :return: 0 on completion. Exits with status 2 (through
             ``OptionParser.error``) when no job id is given.
    """
    usage = "Usage: %prog -j jobId"
    parser = OptionParser(usage=usage)
    parser.add_option('-j', '--jobId', help='Wmbs jobId reported in the component log', dest='jobId')
    (options, args) = parser.parse_args()
    if not options.jobId:
        # parser.error() terminates the process, so the hint is printed first.
        print('Example: python fixJobAccountant.py -j "1678 1679"')
        parser.error('You must provide at least one jobId')

    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    if 'manage' not in os.environ:
        os.environ['manage'] = '/data/srv/wmagent/current/config/wmagent/manage'

    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    for job in options.jobId.split():
        # NOTE(review): the job id comes from the command line and is
        # concatenated into the SQL text; getQuery is defined outside this
        # block, so it could not be switched to a bind variable here.
        myQuery = getQuery + str(job)
        output = myThread.transaction.processData(myQuery)
        result = formatter.format(output)
        if not result:
            print("No report found for jobId: %s.\nSkipping .." % job)
            continue
        reportPath = result[0][0]
        taskName = result[0][1]

        jr = Report(reportPath)
        if jr.getTaskName():
            print("Job id %s already has a TaskName %s.\nSkipping .." % (job, jr.getTaskName()))
            continue
        jr.setTaskName(taskName)
        jr.save(reportPath)
        print("Updated TaskName for fwjr for jobId: %s" % job)

    print("Done!")
    return 0
def main():
    """
    _main_

    Attach the CERN location to every available file that has no location.

    Runs the ``getFilesAvailable`` query, de-duplicates the returned file ids,
    looks up the CERN location row with ``getCERNLocation`` and passes the
    merged binds to ``updateFileLocation``.

    Exits the interpreter with status 0 on success, or 1 when the CERN
    location lookup returns nothing.
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    if 'manage' not in os.environ:
        os.environ['manage'] = '/data/srv/wmagent/current/config/wmagent/manage'

    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    # Get all the files available for each subscription
    print("Getting files available without location...")
    availFiles = formatter.formatDict(
        myThread.dbi.processData(getFilesAvailable))
    print("Total files available: %s" % len(availFiles))
    uniqAvailFiles = list(set(x['fileid'] for x in availFiles))
    availFiles = [{'fileid': x} for x in uniqAvailFiles]
    print("Total unique files available: %s" % len(uniqAvailFiles))

    # Taking element [0] straight from the lookup would raise IndexError on
    # an empty result and hide the explanatory message below.
    cernRows = formatter.formatDict(myThread.dbi.processData(getCERNLocation))
    cernID = cernRows[0] if cernRows else None
    print("CERN location id: %s" % cernID)
    if not cernID:
        print("You need to add T0_CH_CERN to the resource control db")
        sys.exit(1)

    for fid in availFiles:
        fid.update(cernID)

    # NOTE(review): an empty bind list presumably makes processData run the
    # statement with unbound placeholders - skip the call when there is
    # nothing to update.
    if availFiles:
        myThread.dbi.processData(updateFileLocation, availFiles)

    print("Done!")
    sys.exit(0)
# Build and save a ReReco workload with its sandbox.
# The default arguments are set in:
#   WMCORE/src/python/WMCore/WMSpec/StdSpecs/ReReco.py
arguments = getTestArguments()
for splitKey, splitValue in (
        ("StdJobSplitAlgo", "FileBased"),
        ("StdJobSplitArgs", {"files_per_job": 1}),
        ("SkimJobSplitAlgo", "FileBased"),
        ("SkimJobSplitArgs", {"files_per_job": 1, "include_parents": True})):
    arguments[splitKey] = splitValue

# Exactly one command line argument is expected: the processing version.
if len(sys.argv) != 2:
    print("Usage:")
    print("./injectReRecoWorkflow.py PROCESSING_VERSION")
    sys.exit(1)
arguments["ProcessingVersion"] = sys.argv[1]

connectToDB()

# The workload is written to a new directory named after the workload.
workloadName = "ReReco-%s" % arguments["ProcessingVersion"]
workloadFile = "reReco-%s.pkl" % arguments["ProcessingVersion"]
workloadPath = os.path.join(workloadName, workloadFile)
os.mkdir(workloadName)

workload = rerecoWorkload(workloadName, arguments)
workload.setOwner("*****@*****.**")
workload.setSpecUrl(workloadPath)

# Build a sandbox using TaskMaker
sandboxDir = os.path.join(os.getcwd(), workloadName)
taskMaker = TaskMaker(workload, sandboxDir)
taskMaker.skipSubscription = True
taskMaker.processWorkload()

workload.save(workloadPath)
# File based splitting, one file per job, for the AlcaReco step.
arguments.update
arguments["AlcaRecoJobSplitAlgo"] = "FileBased"
arguments["AlcaRecoJobSplitArgs"] = {"files_per_job": 1}

# Five command line arguments are expected, see the usage text.
if len(sys.argv) != 6:
    print("Usage:")
    print(" ".join([sys.argv[0],
                    "PROCESSING_VERSION NUM_EVENTS GenConfigCacheID RecoConfigCacheID AlcaRecoConfigCacheID"]))
    sys.exit(1)

arguments["ProcessingVersion"] = sys.argv[1]
numEvents = int(sys.argv[2])
for position, cacheKey in enumerate(("GenConfigCacheID", "RecoConfigCacheID", "AlcaRecoConfigCacheID"), 3):
    arguments[cacheKey] = sys.argv[position]

connectToDB()

# The workload is written to a new directory named after the workload.
workloadName = "RelValMC-%s" % arguments["ProcessingVersion"]
workloadFile = "relValMC-%s.pkl" % arguments["ProcessingVersion"]
workloadPath = os.path.join(workloadName, workloadFile)
os.mkdir(workloadName)

workload = relValMCWorkload(workloadName, arguments)
workload.setOwner("*****@*****.**")
workload.setSpecUrl(workloadPath)

# Build a sandbox using TaskMaker
sandboxDir = os.path.join(os.getcwd(), workloadName)
taskMaker = TaskMaker(workload, sandboxDir)
taskMaker.skipSubscription = True
taskMaker.processWorkload()

workload.save(workloadPath)
def getWMBSInfo(config):
    """
    Print a report of the WMBS, WorkQueue, DBS and PhEDEx state of the agent.

    Runs a series of module-level SQL queries against the agent database and
    prints one section per query, resolving workflow names through
    ``printWfStatus`` / ``getDsetAndWf`` and checking the local and global
    workqueue with ``checkLocalWQStatus`` / ``checkGlobalWQStatus``.

    :param config: agent configuration, forwarded to the workqueue and
                   workflow spec helpers
    :return: None
    """
    connectToDB()
    currThread = threading.currentThread()
    formatter = DBFormatter(logging, currThread.dbi)

    def queryDicts(sql):
        # rows of the query as a list of dictionaries
        return formatter.formatDict(currThread.dbi.processData(sql))

    def queryFlat(sql):
        # rows of the query flattened into a single list
        return flattenList(formatter.format(currThread.dbi.processData(sql)))

    workflows = [wf['name'] for wf in queryDicts(knownWorkflows)]
    print("\n*** WORKFLOWS: found %d distinct workflows in this agent." % len(workflows))
    workflowsDict = fetchWorkflowsSpec(config, workflows)
    printWfStatus(workflows, workflowsDict)

    for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
        print("\n*** WORKQUEUE: elements still marked as %s in LQ workqueue / workqueue_inbox." % st)
        checkLocalWQStatus(config, st)

    for st in ("Acquired", "Running"):
        print("\n*** WORKQUEUE: elements still marked as %s in GQ workqueue." % st)
        checkGlobalWQStatus(config, st)

    workflows = [wf['name'] for wf in queryDicts(incompleteWfs)]
    print("\n*** WORKFLOWS: there are %d distinct workflows not completed." % len(workflows))
    printWfStatus(workflows, workflowsDict)

    wfsNotInjected = queryFlat(workflowsNotInjected)
    print("\n*** WORKFLOWS: found %d workflows not fully injected." % len(wfsNotInjected))
    printWfStatus(wfsNotInjected, workflowsDict)

    jobsByState = queryDicts(jobCountByState)
    print("\n*** WMBS: amount of wmbs jobs in each status:\n%s" % jobsByState)
    # IF we have executing jobs in wmbs and nothing in condor, then investigate the wfs
    if any(item['name'] == 'executing' for item in jobsByState):
        wfsJobCount = queryDicts(workflowsExecuting)
        print("\n*** WMBS: %d workflows with executing jobs in wmbs:" % len(wfsJobCount))
        workflows = [wf['name'] for wf in wfsJobCount]
        printWfStatus(workflows, workflowsDict)

    unfinishedSubs = [wf['wfname'] for wf in queryDicts(unfinishedSubscriptions)]
    print("\n*** SUBSCRIPTIONS: subscriptions not finished: %d" % len(unfinishedSubs))
    printWfStatus(unfinishedSubs, workflowsDict)

    filesAvailable = queryDicts(filesAvailWMBS)
    print("\n*** SUBSCRIPTIONS: found %d files available in WMBS (waiting for job creation):\n%s"
          % (len(filesAvailable), filesAvailable))

    filesAcquired = queryDicts(filesAcqWMBS)
    print("\n*** SUBSCRIPTIONS: found %d files acquired in WMBS (waiting for jobs to finish):\n%s"
          % (len(filesAcquired), filesAcquired))

    blocksopenDBS = queryDicts(blocksOpenDBS)
    print("\n*** DBS: found %d blocks open in DBS." % len(blocksopenDBS), end="")
    print(" Printing the first 20 blocks only:\n%s" % blocksopenDBS[:20])

    filesnotinDBS = queryFlat(filesNotInDBS)
    print("\n*** DBS: found %d files not uploaded to DBS.\n" % len(filesnotinDBS))
    getDsetAndWf(filesnotinDBS, workflowsDict)

    filesnotinPhedex = queryFlat(filesNotInPhedex)
    print("\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (recoverable)."
          % len(filesnotinPhedex))
    getDsetAndWf(filesnotinPhedex, workflowsDict)

    # NOTE(review): this message repeats "with valid block id" although the
    # query name suggests files with a null block id - confirm the wording.
    filesnotinPhedexNull = queryFlat(filesNotInPhedexNull)
    print("\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (unrecoverable)."
          % len(filesnotinPhedexNull))
    getDsetAndWf(filesnotinPhedexNull, workflowsDict)
def _managePhEDExInjector(action):
    """
    Run the agent ``manage`` script to control the PhEDExInjector component.

    :param action: wmcoreD option to pass, '--shutdown' or '--start'
    """
    # The context manager closes the devnull handle once the call returns.
    with open(os.devnull, 'wb') as devnull:
        subprocess.call([os.environ['manage'], "execute-agent", "wmcoreD", action,
                         "--component=PhEDExInjector"], stdout=devnull)


def main():
    """
    _main_

    Update buffer files that PhEDEx already knows about.

    The PhEDExInjector is shut down, the files returned by the module-level
    ``getQuery`` are grouped by dataset and, for every dataset, the number of
    files in PhEDEx is compared with the number of files in DBS. Files of
    datasets with matching counts are taken as injected; files of the other
    datasets are checked one by one in PhEDEx. The resulting lfns become the
    binds of the module-level ``setQuery`` and the PhEDExInjector is started
    again.

    :return: 0 in all cases
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    if 'manage' not in os.environ:
        os.environ['manage'] = '/data/srv/wmagent/current/config/wmagent/manage'

    connectToDB()
    myPhEDEx = PhEDEx()
    myDBS = DBS3Reader('https://cmsweb.cern.ch/dbs/prod/global/DBSReader/')
    myThread = threading.currentThread()
    print("Shutting down PhEDExInjector...")
    _managePhEDExInjector("--shutdown")
    time.sleep(5)

    ## TASK1: query DB for files not injected in phedex yet
    # Get the files that the PhEDExInjector would look for
    formatter = DBFormatter(logging, myThread.dbi)
    formatter.sql = getQuery
    results = formatter.execute()
    fileList = [lfn[0] for lfn in results]

    ## TASK2: makes lfns a bit shorter to sort and uniq them
    reducedLfns = list(set(lfn.rsplit('/', 2)[0] for lfn in fileList))

    ## TASK3: build uniq dataset names and check whether PhEDEx and DBS contain
    ## the same number of files. If so, then those lfns are healthy
    print("Checking %d dataset in both PhEDEx and DBS ..." % len(reducedLfns))
    crippleLfns, healthyLfns = [], []
    n = len(reducedLfns)
    for i, lfn in enumerate(reducedLfns, 1):
        try:
            lfnAux = lfn.split('/')
            dset = '/' + lfnAux[4] + '/' + lfnAux[3] + '-' + lfnAux[6] + '/' + lfnAux[5]
            result = myPhEDEx._getResult('blockreplicas', args={'dataset': dset}, verb='GET')
            phedexFiles = sum(item['files'] for item in result["phedex"]["block"])

            ## TODO: ValidFile is only available for > 0.9.95pre5. Once all agents are
            ## upgraded, listDatasetFileDetails can be used to count valid files only.
            # This call returns valid+invalid number of files
            dbsFiles = len(myDBS.listDatasetFiles(dset))
            if phedexFiles == dbsFiles:
                healthyLfns.append(lfn)
            else:
                crippleLfns.append(lfn)
        except Exception as ex:
            # Best effort: report the dataset that failed and move on.
            print("Error with: %s (%s)" % (lfn, ex))
        if i % 100 == 0:
            print('%d/%d files processed' % (i, n))

    ## TASK4: map the short cripple and healthy lists to the full original lfns
    filesToCheck = [name for lfn in crippleLfns for name in fileList if lfn in name]
    filesInPhedex = [name for lfn in healthyLfns for name in fileList if lfn in name]

    ## TASK5: query PhEDEx for each cripple file (filesToCheck)
    ## and build the final file lists
    n = len(filesToCheck)
    for i, name in enumerate(filesToCheck, 1):
        try:
            result = myPhEDEx._getResult('data', args={'file': name}, verb='GET')
            if len(result['phedex']['dbs']):
                filesInPhedex.append(name)
        except Exception as ex:
            print("Error contacting Phedex %s (%s)" % (name, ex))
        if i % 100 == 0:
            print('%d/%d files processed' % (i, n))

    if not filesInPhedex:
        print("There are no files to be updated in the buffer. Contact a developer.")
        print("Starting PhEDExInjector now ...")
        _managePhEDExInjector("--start")
        return 0
    print("Found %d out of %d files that are already registered in PhEDEx but buffer doesn't know"
          % (len(filesInPhedex), len(fileList)))
    print("Fixing them now, it may take several minutes ...")

    ## TASK6: time to actually fix these files
    binds = [{'lfn': name} for name in filesInPhedex]
    formatter.dbi.processData(setQuery, binds, conn=None, transaction=False, returnCursor=False)

    print("Rows were successfully updated! Good job!")
    print("Starting PhEDExInjector now ...")
    _managePhEDExInjector("--start")
    print("Done!")
    return 0
def getWMBSInfo(config):
    """
    Print a status report built from the agent database.

    Each section runs one module-level SQL query and prints what it found:
    known and incomplete workflows, workqueue elements (local and global),
    WMBS job counts, unfinished subscriptions, available/acquired files,
    open DBS blocks and files missing from DBS or PhEDEx.

    :param config: agent configuration, passed on to the workqueue checks
                   and to ``fetchWorkflowsSpec``
    :return: None
    """
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)

    def runDict(sql):
        # result of the query as a list of dictionaries
        return formatter.formatDict(myThread.dbi.processData(sql))

    def runFlat(sql):
        # result of the query flattened into one list
        return flattenList(formatter.format(myThread.dbi.processData(sql)))

    def namesOf(rows, key='name'):
        return [row[key] for row in rows]

    workflows = namesOf(runDict(knownWorkflows))
    print("\n*** WORKFLOWS: found %d distinct workflows in this agent." % len(workflows))
    workflowsDict = fetchWorkflowsSpec(config, workflows)
    printWfStatus(workflows, workflowsDict)

    for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
        print("\n*** WORKQUEUE: elements still marked as %s in LQ workqueue / workqueue_inbox." % st)
        checkLocalWQStatus(config, st)

    for st in ("Acquired", "Running"):
        print("\n*** WORKQUEUE: elements still marked as %s in GQ workqueue." % st)
        checkGlobalWQStatus(config, st)

    workflows = namesOf(runDict(incompleteWfs))
    print("\n*** WORKFLOWS: there are %d distinct workflows not completed." % len(workflows))
    printWfStatus(workflows, workflowsDict)

    wfsNotInjected = runFlat(workflowsNotInjected)
    print("\n*** WORKFLOWS: found %d workflows not fully injected." % len(wfsNotInjected))
    printWfStatus(wfsNotInjected, workflowsDict)

    jobsByState = runDict(jobCountByState)
    print("\n*** WMBS: amount of wmbs jobs in each status:\n%s" % jobsByState)
    # IF we have executing jobs in wmbs and nothing in condor, then investigate the wfs
    if 'executing' in namesOf(jobsByState):
        wfsJobCount = runDict(workflowsExecuting)
        print("\n*** WMBS: %d workflows with executing jobs in wmbs:" % len(wfsJobCount))
        workflows = namesOf(wfsJobCount)
        printWfStatus(workflows, workflowsDict)

    unfinishedSubs = namesOf(runDict(unfinishedSubscriptions), key='wfname')
    print("\n*** SUBSCRIPTIONS: subscriptions not finished: %d" % len(unfinishedSubs))
    printWfStatus(unfinishedSubs, workflowsDict)

    filesAvailable = runDict(filesAvailWMBS)
    print("\n*** SUBSCRIPTIONS: found %d files available in WMBS (waiting for job creation):\n%s"
          % (len(filesAvailable), filesAvailable))

    filesAcquired = runDict(filesAcqWMBS)
    print("\n*** SUBSCRIPTIONS: found %d files acquired in WMBS (waiting for jobs to finish):\n%s"
          % (len(filesAcquired), filesAcquired))

    blocksopenDBS = runDict(blocksOpenDBS)
    print("\n*** DBS: found %d blocks open in DBS." % len(blocksopenDBS), end="")
    print(" Printing the first 20 blocks only:\n%s" % blocksopenDBS[:20])

    filesnotinDBS = runFlat(filesNotInDBS)
    print("\n*** DBS: found %d files not uploaded to DBS.\n" % len(filesnotinDBS))
    getDsetAndWf(filesnotinDBS, workflowsDict)

    filesnotinPhedex = runFlat(filesNotInPhedex)
    print("\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (recoverable)."
          % len(filesnotinPhedex))
    getDsetAndWf(filesnotinPhedex, workflowsDict)

    # NOTE(review): this message repeats "with valid block id" although the
    # query name suggests files with a null block id - confirm the wording.
    filesnotinPhedexNull = runFlat(filesNotInPhedexNull)
    print("\n*** PHEDEX: found %d files not injected in PhEDEx, with valid block id (unrecoverable)."
          % len(filesnotinPhedexNull))
    getDsetAndWf(filesnotinPhedexNull, workflowsDict)
def _managePhEDExInjector(action):
    """
    __managePhEDExInjector_

    Run the agent `manage` script (path taken from os.environ['manage']) for
    the PhEDExInjector component, discarding its stdout.

    :param action: wmcoreD switch to pass, either "--shutdown" or "--start"
    :return: exit code of the manage script
    """
    # Context manager so that the devnull handle is not leaked
    with open(os.devnull, 'wb') as devnull:
        return subprocess.call([os.environ['manage'], "execute-agent", "wmcoreD", action,
                                "--component=PhEDExInjector"], stdout=devnull)


def _groupLfnsByPrefix(fileList):
    """
    __groupLfnsByPrefix_

    Group full lfns by their "reduced" form, i.e. the lfn without its
    last two path components.

    :param fileList: list of lfn strings
    :return: dict keyed by reduced lfn, valued by the list of full lfns
             (in their original order) sharing that exact prefix
    """
    grouped = {}
    for lfn in fileList:
        grouped.setdefault(lfn.rsplit('/', 2)[0], []).append(lfn)
    return grouped


def main():
    """
    _main_

    Shut down PhEDExInjector, find the files that the buffer does not know
    as injected (getQuery), compare the file counts reported by PhEDEx and
    DBS for their datasets and, for files that PhEDEx already knows about,
    run setQuery with their lfns as binds. PhEDExInjector is started again
    before returning, even when one of the steps fails.

    :return: 0
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    if 'manage' not in os.environ:
        os.environ['manage'] = '/data/srv/wmagent/current/config/wmagent/manage'

    connectToDB()
    myPhEDEx = PhEDEx()
    myDBS = DBS3Reader('https://cmsweb.cern.ch/dbs/prod/global/DBSReader/')
    myThread = threading.currentThread()
    print("Shutting down PhEDExInjector...")
    _managePhEDExInjector("--shutdown")
    time.sleep(5)

    # From here on the component is down: whatever happens, bring it back up
    try:
        ## TASK1: query DB for files not injected in phedex yet
        # Get the files that the PhEDExInjector would look for
        formatter = DBFormatter(logging, myThread.dbi)
        formatter.sql = getQuery
        results = formatter.execute()
        fileList = [row[0] for row in results]

        ## TASK2: makes lfns a bit shorter to sort and uniq them
        # Grouping by the exact prefix (instead of substring matching later on)
        # keeps a file from being mapped to another dataset whose path merely
        # contains the same prefix, and avoids the quadratic scan of fileList.
        lfnsByPrefix = _groupLfnsByPrefix(fileList)
        reducedLfns = list(lfnsByPrefix)

        ## TASK3: build uniq dataset names and check whether PhEDEx and DBS contain
        ## the same number of files. If so, then those lfns are healthy
        print("Checking %d dataset in both PhEDEx and DBS ..." % len(reducedLfns))
        crippleLfns, healthyLfns = [], []
        total = len(reducedLfns)
        for count, lfn in enumerate(reducedLfns, 1):
            try:
                lfnAux = lfn.split('/')
                dset = '/' + lfnAux[4] + '/' + lfnAux[3] + '-' + lfnAux[6] + '/' + lfnAux[5]
                result = myPhEDEx._getResult('blockreplicas', args={'dataset': dset}, verb='GET')
                phedexFiles = sum(item['files'] for item in result["phedex"]["block"])

                ## TODO: ValidFile is only available for > 0.9.95pre5. Once all agents are
                ## upgraded, then we can start using myDBS.listDatasetFileDetails(dset)
                ## and count only the entries with item['ValidFile'] set.
                # This call returns valid+invalid number of files
                dbsFiles = len(myDBS.listDatasetFiles(dset))
                if phedexFiles == dbsFiles:
                    healthyLfns.append(lfn)
                else:
                    crippleLfns.append(lfn)
            except Exception as ex:
                # best effort: report and move on, this lfn lands in neither list
                print("Error with: %s (%s)" % (lfn, ex))
            if count % 100 == 0:
                print('%d/%d files processed' % (count, total))

        ## TASK4: map the short cripple and healthy lists to the full original lfns
        filesToCheck = []
        for lfn in crippleLfns:
            filesToCheck.extend(lfnsByPrefix[lfn])
        filesInPhedex = []
        for lfn in healthyLfns:
            filesInPhedex.extend(lfnsByPrefix[lfn])

        ## TASK5: query PhEDEx for each cripple file (filesToCheck)
        ## and build the final file lists
        missingFiles = []
        total = len(filesToCheck)
        for count, lfn in enumerate(filesToCheck, 1):
            try:
                result = myPhEDEx._getResult('data', args={'file': lfn}, verb='GET')
                if len(result['phedex']['dbs']):
                    filesInPhedex.append(lfn)
                else:
                    missingFiles.append(lfn)
            except Exception as ex:
                print("Error contacting Phedex %s (%s)" % (lfn, ex))
            if count % 100 == 0:
                print('%d/%d files processed' % (count, total))

        if not filesInPhedex:
            print("There are no files to be updated in the buffer. Contact a developer.")
            return 0
        print("Found %d out of %d files that are already registered in PhEDEx "
              "but buffer doesn't know" % (len(filesInPhedex), len(fileList)))
        print("Fixing them now, it may take several minutes ...")

        ## TASK6: time to actually fix these files
        binds = [{'lfn': lfn} for lfn in filesInPhedex]
        formatter.dbi.processData(setQuery, binds,
                                  conn=None,
                                  transaction=False,
                                  returnCursor=False)
        print("Rows were successfully updated! Good job!")
    finally:
        print("Starting PhEDExInjector now ...")
        _managePhEDExInjector("--start")

    print("Done!")
    return 0
def main():
    """
    _main_

    Look for output files (lfns) reported by more than one job report:
      1. collect the job report pickle paths mentioned in the last 1000
         lines of the JobAccountant ComponentLog;
      2. load each existing pickle and map every output lfn to the pickles
         that report it; lfns reported by more than one pickle are dumped,
         with their reports, to dupPickles.json and the operator is asked
         whether all but the first pickle can be deleted;
      3. compare the lfns found in the pickles against wmbs_file_details and
         print an overview of the job reports whose lfns are already there.

    Exits the process with code 0.
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    if 'manage' not in os.environ:
        os.environ['manage'] = '/data/srv/wmagent/current/config/wmagent/manage'

    ### Fetch the report pickle files from the component log
    command = ["tail", "-n1000", "install/wmagent/JobAccountant/ComponentLog"]
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, _ = p.communicate()
    # The same pickle can be logged more than once; keep each path only once
    # (in order of appearance), otherwise a single job report would look like
    # several jobs writing the same lfn and could end up deleting itself below.
    logFiles = []
    seenPkls = set()
    for line in out.splitlines():
        if 'install/wmagent/JobCreator/JobCache' not in line:
            continue
        fields = line.split()
        if len(fields) < 3:
            # unexpected log line layout, there is no pickle path to extract
            continue
        if fields[2] not in seenPkls:
            seenPkls.add(fields[2])
            logFiles.append(fields[2])
    msg = "Found %d pickle files to parse " % len(logFiles)

    ### Now unpickle each of these files and get their output files
    # also check whether any of them are duplicate
    lfn2PklDict = {}
    dupOutputPkl = {}  # string value with the dup LFN and keyed by the pickle file path
    jobReport = Report()
    for pklPath in logFiles:
        if not os.path.exists(pklPath):
            continue

        jobReport.load(pklPath)
        for e in jobReport.getAllFiles():
            lfn2PklDict.setdefault(e['lfn'], []).append(pklPath)

    # now check which files contain more than one pickle path (= created by diff jobs)
    dupFiles = []
    for lfn, pkls in lfn2PklDict.items():
        if len(pkls) <= 1:
            continue
        dupFiles.append(lfn)
        for pkl in pkls:
            if pkl not in dupOutputPkl:
                jobReport.load(pkl)
                dupOutputPkl[pkl] = jobReport.__to_json__(None)
                dupOutputPkl[pkl]['dup_lfns'] = []
            dupOutputPkl[pkl]['dup_lfns'].append(lfn)

    msg += "with a total of %d output files and %d duplicated" % (len(lfn2PklDict), len(dupFiles))
    msg += " files to process among them."
    msg += "\nDuplicate files are:\n%s" % dupFiles
    print(msg)

    if dupFiles:
        print("See dupPickles.json for further details ...")
        with open('dupPickles.json', 'w') as fo:
            json.dump(dupOutputPkl, fo, indent=2)

        var = raw_input("Can we automatically delete those pickle files? Y/N\n")
        if var == "Y":
            # then delete all job report files but the first one - NOT ideal
            for fname in dupFiles:
                for pklFile in lfn2PklDict[fname][1:]:
                    if os.path.isfile(pklFile):
                        print("Deleting %s ..." % pklFile)
                        os.remove(pklFile)
                    else:
                        print(" File has probably been already deleted %s ..." % pklFile)
            print(" Done!")

    ### Time to load all - this is BAD - LFNs from WMBS database
    print("\nNow loading all LFNs from wmbs_file_details ...")
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)
    output = myThread.transaction.processData("SELECT lfn FROM wmbs_file_details")
    lfnsDB = [item[0] for item in formatter.format(output)]
    print("Retrieved %d lfns from wmbs_file_details" % len(lfnsDB))

    ### Compare what are the duplicates
    dupFiles = list(set(lfn2PklDict) & set(lfnsDB))
    print("\nFound %d duplicate files." % len(dupFiles))
    if not dupFiles:
        sys.exit(0)

    ### Print some basic data about these reports
    print("Their overview is: ")
    dbDupPkl = []
    for fname in dupFiles:
        for pklPath in lfn2PklDict[fname]:
            if not os.path.exists(pklPath):
                # it may have just been removed by the cleanup step above
                print("Skipping job report no longer on disk: %s" % pklPath)
                continue
            jobReport.load(pklPath)
            dbDupPkl.append({'lfn': fname,
                             'pklPath': pklPath,
                             'exitCode': jobReport.getExitCode(),
                             'taskSuccess': jobReport.taskSuccessful(),
                             'EOSLogURL': jobReport.getLogURL(),
                             'HostName': jobReport.getWorkerNodeInfo()['HostName'],
                             'Site': jobReport.getSiteName(),
                             'task': jobReport.getTaskName()})

    print(pformat(dbDupPkl))
    print("")

    print("Remove them, restart the component and be happy!\n")
    sys.exit(0)
def checkJobCountsAgent(requestName):
    """
    _checkJobCountsAgent_

    Print a summary of the state of a workflow in the agent database.

    Check #1 prints, for every task of the workflow with unfinished
    subscriptions, the number of jobs in each state (cleanout excluded) and
    returns if anything was found. Otherwise, the subscriptions of the
    workflow are counted and, for each unfinished one, the number of
    available and acquired files is printed.

    The workflow name and subscription ids are passed to the database as
    bind variables rather than being interpolated into the SQL text.

    :param requestName: workflow name, matched against wmbs_workflow.name
    :return: None
    """
    os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'

    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)
    wfBinds = {'wfname': requestName}

    unfinishedTasks = formatter.formatDict(myThread.dbi.processData(
        """SELECT wmbs_workflow.task, wmbs_job_state.name,
                  COUNT(wmbs_job.id) AS jobcount
             FROM wmbs_workflow
             INNER JOIN wmbs_subscription ON
                 wmbs_subscription.workflow = wmbs_workflow.id
             INNER JOIN wmbs_jobgroup ON
                 wmbs_jobgroup.subscription = wmbs_subscription.id
             INNER JOIN wmbs_job ON
                 wmbs_job.jobgroup = wmbs_jobgroup.id
             INNER JOIN wmbs_job_state ON
                 wmbs_job.state = wmbs_job_state.id
             WHERE wmbs_workflow.name = :wfname AND
                   wmbs_subscription.finished = 0 AND
                   wmbs_job_state.name != 'cleanout'
             GROUP BY wmbs_workflow.task,
                      wmbs_job_state.name""", wfBinds))

    # {task: {job state: job count}}
    result = {}
    for row in unfinishedTasks:
        result.setdefault(row['task'], {})[row['name']] = row['jobcount']

    for task in result:
        msg = "Task %s has " % task
        for state in result[task]:
            msg += '%d jobs %s ' % (result[task][state], state)
        print(msg)

    if result:
        # Check #1 found unfinished tasks, nothing else to look at
        return
    print("Check #1 failed, there are no unfinished tasks in the system apparently.")

    unfinishedSubs = formatter.formatDict(myThread.dbi.processData(
        """SELECT wmbs_subscription.id, wmbs_workflow.task
             FROM wmbs_workflow
             INNER JOIN wmbs_subscription ON
                 wmbs_subscription.workflow = wmbs_workflow.id
             WHERE wmbs_workflow.name = :wfname AND
                   wmbs_subscription.finished = 0""", wfBinds))
    totalSubs = formatter.formatDict(myThread.dbi.processData(
        """SELECT wmbs_subscription.id, wmbs_workflow.task
             FROM wmbs_workflow
             INNER JOIN wmbs_subscription ON
                 wmbs_subscription.workflow = wmbs_workflow.id
             WHERE wmbs_workflow.name = :wfname""", wfBinds))
    print("There are %d subscriptions for this workflow, %d are incomplete."
          % (len(totalSubs), len(unfinishedSubs)))

    if unfinishedSubs:
        for sub in unfinishedSubs:
            subId = sub['id']
            subBinds = {'subid': subId}
            availableFiles = formatter.formatDict(myThread.dbi.processData(
                """SELECT COUNT(wmbs_sub_files_available.fileid) AS count
                     FROM wmbs_sub_files_available
                     WHERE wmbs_sub_files_available.subscription = :subid""", subBinds))
            acquiredFiles = formatter.formatDict(myThread.dbi.processData(
                """SELECT COUNT(wmbs_sub_files_acquired.fileid) AS count
                     FROM wmbs_sub_files_acquired
                     WHERE wmbs_sub_files_acquired.subscription = :subid""", subBinds))
            print("There are %s files available and %s files acquired in the subscription %s. If the JobCreator is up, more jobs will appear soon."
                  % (availableFiles[0]['count'], acquiredFiles[0]['count'], subId))
    else:
        print("This workflow has all subscriptions as finished, the TaskArchiver should be eating through it now. This can take time though.")
    return