def rejector(url, specific, options=None):
    up = componentInfo()

    if specific.startswith('/'):
        ## this is a dataset name, not a workflow name
        pass
    else:
        wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", specific
            return
        results = []
        wfi = workflowInfo(url, wfo.name)
        reqMgrClient.invalidateWorkflow(url, wfo.name, current_status=wfi.request['RequestStatus'])
        #if wfi.request['RequestStatus'] in ['assignment-approved','new','completed']:
        #    #results.append( reqMgrClient.rejectWorkflow(url, wfo.name))
        #    reqMgrClient.rejectWorkflow(url, wfo.name)
        #else:
        #    #results.append( reqMgrClient.abortWorkflow(url, wfo.name))
        #    reqMgrClient.abortWorkflow(url, wfo.name)
        datasets = wfi.request['OutputDatasets']
        for dataset in datasets:
            if options.keep:
                print "keeping", dataset, "in its current status"
            else:
                results.append(setDatasetStatus(dataset, 'INVALID'))

        if all(map(lambda result: result in ['None', None, True], results)):
            wfo.status = 'forget'
            session.commit()
            print wfo.name, "and", datasets, "are rejected"
            if options and options.clone:
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] += 1
                else:
                    schema['ProcessingVersion'] = 2
                ##schema.pop('RequestDate') ## ok then, let's not reset the time stamp
                if options.Memory:
                    schema['Memory'] = options.Memory
                response = reqMgrClient.submitWorkflow(url, schema)
                m = re.search("details\/(.*)\'", response)
                if not m:
                    print "error in cloning", wfo.name
                    print response
                    return
                newWorkflow = m.group(1)
                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfo.status = 'trouble'
                session.commit()
        else:
            print "error in rejecting", wfo.name, results
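## --- Illustrative usage sketch (not part of the original module) ---
## rejector() expects an `options` object carrying at least .keep, .clone and
## .Memory. A minimal, hypothetical stand-in built with optparse, assuming
## only the attribute names read above:
from optparse import OptionParser

def _rejector_options_sketch():
    parser = OptionParser()
    parser.add_option('--keep', action='store_true', default=False,
                      help='keep output datasets in their current status')
    parser.add_option('--clone', action='store_true', default=False,
                      help='resubmit a clone after rejecting')
    parser.add_option('--Memory', type=int, default=None,
                      help='override Memory in the cloned schema')
    (opts, args) = parser.parse_args([])
    return opts
## e.g. rejector(url, 'pdmvserv_task_EXO-Example-00001_v0', options=_rejector_options_sketch())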
def closor(url, specific=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    ## manually closed-out workflows should get to close with checkor
    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    held = set()

    max_per_round = UC.get('max_per_round').get('closor', None)
    random.shuffle(wfs)
    if max_per_round:
        wfs = wfs[:max_per_round]

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status))
            session.commit()

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id  ## (sic) matches the column name in the Output model
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            wfi.sendLog('closor', "\t%60s %d/%d = %3.2f%%" % (out, lumi_count, expected_lumis, lumi_count / float(expected_lumis) * 100.))
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            #fraction = wfo.fraction_for_closing
            #fraction = 0.0
            #all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:
                going_to = wfi.request['NonCustodialSites'] + wfi.request['CustodialSites']
                wfi.sendLog('closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and all
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting
        (OK, requests) = spawn_harvesting(url, wfi, in_full)
        all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']:
            print wfo.name, "to be announced"
            results = []  #'dummy']
            if not results:
                for out in outputs:
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get("tiers_to_DDM"):
                            print "tier", tier, "neither TO nor NO DDM for", out
                            results.append('Not recognized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog('closor', "could not recognize %s for injecting in DDM" % out, level='critical')
                            continue

                        n_copies = 2
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items():
                                    if ddmtier == tier or ddmtier in ['*', 'all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(destinations)

                        ## inject to DDM when necessary
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec))
                            print p.read()
                            status = p.close()
                            if status != None:
                                print "Failed DDM, retrying a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec))
                                print p.read()
                                status = p.close()
                                if status != None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor', "could not add " + out + " to DDM pool. check closor logs.", level='critical')
                            results.append(status)
                            if status == None:
                                ## argument order fixed: the dataset comes first in the message
                                wfi.sendLog('closor', '%s is sent to AnalysisOps DDM pool in %s copies %s' % (out, n_copies, destination_spec))
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                if all(map(lambda result: result in ['None', None, True], results)):
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None', None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        wfo.wm_status = wl_bis.request['RequestStatus']
                        session.commit()
                        if wl_bis.request['RequestStatus'] in ['announced', 'normal-archived']:
                            res = None
                        else:
                            ## retry ?
                            res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True], results)):
                wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow is announced")
            else:
                print "ERROR with ", wfo.name, "to be announced", json.dumps(results)
        else:
            print wfo.name, "not good for announcing:", wfi.request['RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries'] >= retries_late]
    really_late_files = [info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2))

    if held:
        #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))
        sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(held)), level='critical')
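## --- Illustrative helper (not in the original module) ---
## The success test above recurs throughout this file: an operation counts as
## "fine" when it returned True, None, or the string 'None'. A hypothetical
## helper capturing that predicate:
def _all_fine_sketch(results):
    ## True when every accumulated result signals success
    return all(result in ['None', None, True] for result in results)
## e.g. if _all_fine_sketch(results): wfo.status = 'done'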
def closor(url, specific=None):
    if not componentInfo().check(): return
    UC = unifiedConfiguration()
    CI = campaignInfo()
    #LI = lockInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    ## manually closed-out workflows should get to close with checkor
    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    held = set()

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status))
            session.commit()

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            wfi.sendLog('closor', "\t%60s %d/%d = %3.2f%%" % (out, lumi_count, expected_lumis, lumi_count / float(expected_lumis) * 100.))
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            #fraction = wfo.fraction_for_closing
            #fraction = 0.0
            #all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:
                going_to = wfi.request['NonCustodialSites'] + wfi.request['CustodialSites']
                wfi.sendLog('closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and all
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting
        (OK, requests) = spawn_harvesting(url, wfi, in_full)
        all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']:
            print wfo.name, "to be announced"
            results = []  #'dummy']
            if not results:
                for out in outputs:
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get("tiers_to_DDM"):
                            print "tier", tier, "neither TO nor NO DDM for", out
                            results.append('Not recognized tier %s' % tier)
                            sendEmail("failed DDM injection", "could not recognize %s for injecting in DDM" % out)
                            continue

                        n_copies = 2
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items():
                                    if ddmtier == tier or ddmtier in ['*', 'all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(destinations)

                        ## inject to DDM when necessary
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec))
                            print p.read()
                            status = p.close()
                            if status != None:
                                print "Failed DDM, retrying a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec))
                                print p.read()
                                status = p.close()
                                if status != None:
                                    sendEmail("failed DDM injection", "could not add " + out + " to DDM pool. check closor logs.")
                            results.append(status)
                            if status == None:
                                ## argument order fixed: the dataset comes first in the message
                                wfi.sendLog('closor', '%s is sent to AnalysisOps DDM pool in %s copies %s' % (out, n_copies, destination_spec))
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                if all(map(lambda result: result in ['None', None, True], results)):
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None', None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        wfo.wm_status = wl_bis.request['RequestStatus']
                        session.commit()
                        if wl_bis.request['RequestStatus'] in ['announced', 'normal-archived']:
                            res = None
                        else:
                            ## retry ?
                            res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True], results)):
                wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow is announced")
            else:
                print "ERROR with ", wfo.name, "to be announced", json.dumps(results)
        else:
            print wfo.name, "not good for announcing:", wfi.request['RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries'] >= retries_late]
    really_late_files = [info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2))
        sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendEmail("held from announcing", "the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s" % ("\n".join(held)))
def actor(url, options=None):
    if moduleLock(wait=False, silent=True)(): return
    if userLock('actor'): return
    up = componentInfo(soft=['mcm'])
    if not up.check(): return
    # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    WC = wtcClient()

    action_list = WC.get_actions()
    if action_list is None:
        print "Not able to load action list"
        sendLog('actor', 'Not able to load action list', level='critical')
        return

    if options.actions:
        action_list = json.loads(open(options.actions).read())

    print json.dumps(action_list, indent=2)
    if not action_list:
        print "EMPTY!"
        return

    wf_list = action_list.keys()
    print json.dumps(sorted(wf_list), indent=2)
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print '-' * 100
        print "Looking at", wfname, "for recovery options"

        to_clone = False
        to_acdc = False
        to_force = False
        to_hold = False
        something_to_do = False

        tasks = action_list[wfname].get('Parameters', None)
        to_acdc = action_list[wfname].get('Action', None) == 'acdc'
        to_clone = action_list[wfname].get('Action', None) == 'clone'
        to_force = action_list[wfname].get('Action', None) == 'special' and action_list[wfname].get('Parameters', {}).get('action', None) in ['by-pass', 'bypass']
        to_hold = action_list[wfname].get('Action', None) == 'special' and action_list[wfname].get('Parameters', {}).get('action', None) in ['onhold', 'on-hold']

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog('actor', 'Action submitted for something other than acdc, clone, bypass or hold for workflow %s' % wfname, level='critical')
            print json.dumps(action_list[wfname], indent=2)
            continue

        if not tasks and to_acdc:
            sendLog('actor', 'Empty action submitted for workflow %s' % wfname, level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

        #===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            results = []
            datasets = set(wfi.request['OutputDatasets'])

            comment = ""
            if 'comment' in tasks:
                comment = ", reason: " + tasks['comment']
            wfi.sendLog('actor', "invalidating the workflow by traffic controller %s" % comment)

            #Reject all workflows in the family,
            #first the original workflow itself.
            reqMgrClient.invalidateWorkflow(url, wfi.request['RequestName'], current_status=wfi.request['RequestStatus'], cascade=False)
            #Then reject any ACDCs associated with that workflow
            family = getWorkflowById(url, wfi.request['PrepID'], details=True)
            for fwl in family:
                print "rejecting", fwl['RequestName'], fwl['RequestStatus']
                wfi.sendLog('actor', "rejecting %s, previous status %s" % (fwl['RequestName'], fwl['RequestStatus']))
                reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'], cascade=False)
                datasets.update(fwl['OutputDatasets'])

            #Invalidate all associated output datasets
            for dataset in datasets:
                results.append(setDatasetStatus(dataset, 'INVALID'))

            if all(map(lambda result: result in ['None', None, True], results)):
                wfi.sendLog('actor', "%s and children are rejected" % wfname)

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog('actor', 'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.' % wfname, level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print str(e)
                ## let's not remove the action, otherwise the workflow goes to "trouble" and the WTC cannot set the action again
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor', 'Failed to create clone for %s!' % wfname, level='critical')
            else:
                wfi.sendLog('actor', "Workflow %s cloned" % wfname)

        #===========================================================
        elif to_force:
            wfi.sendLog('actor', 'Bypassing from workflow traffic controller request')
            forcing = json.loads(open('/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json').read())
            forcing.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json', 'w').write(json.dumps(sorted(set(forcing))))
        elif to_hold:
            wfi.sendLog('actor', 'Holding on workflow traffic controller request')
            holding = json.loads(open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json').read())
            holding.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json', 'w').write(json.dumps(sorted(set(holding))))

        #===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        ## tasks[task] is a dict: assign the default directly.
                        ## (the original 'else' branch called list.append on a dict, which would raise)
                        tasks[task][setting] = allTasksDefaults[setting]

            print "Tasks is "
            print json.dumps(tasks, indent=2)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves
            try:
                WMErr = wfi.getWMErrors()
                # print WMErr
            except:
                sendLog('actor', 'Cannot create ACDCS for %s because WMErr cannot be reached.' % wfname, level='critical')
                continue
            if not WMErr:
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print "FYI getWMErrors is blank. Presumably there are only unreported errors"
                # continue

            try:
                where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo()
                print "Where to run = "
                print where_to_run
            except:
                sendLog('actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog('actor', 'Cannot create ACDCS for %s because site list cannot be found.' % wfname, level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""

            num_tasks_to_recover = 0

            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task: continue
                    if 'Cleanup' in task: continue
                    if not 'jobfailed' in WMErr[task]:
                        continue
                    else:
                        num_tasks_to_recover += 1
                        # print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for", wfname
                # recover = False

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog('actor', 'Cannot create ACDCS for %s because it is a pLHE workflow.' % wfname, level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
                # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)

            # if wfi.request['RequestType'] in ['ReReco']:
            #     recover= False
            #     print 'cannot submit action. ReReco'
            #     sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task name is " + task
                fulltaskname = '/' + wfname + '/' + task
                # print "Full task name is " + fulltaskname
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in ['Processing', 'Production', 'Merge']:
                            wrong_task = True
                            wfi.sendLog('actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks" % (fulltaskname, task_info.taskType))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(set([SI.SE_to_CE(site) for site in actions[action]]))
                    # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
                    #     recover = False;
                    #     print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
                    #     wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]]))
                    print "Found", sorted(assign_to_sites), "as sites where to run the ACDC at, from the acdc doc of ", wfname
                print "Going to run at", sorted(assign_to_sites)

                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do=options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog('actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical')
                                wfi.sendLog('actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
                            # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue
                    else:
                        #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog('actor', 'ACDC created for task %s. Actions taken \n%s' % (fulltaskname, json.dumps(actions)))
                        #team = wfi.request['Teams'][0]
                        team = 'production'
                        parameters = {
                            'SiteWhitelist': sorted(assign_to_sites),
                            'AcquisitionEra': wfi.acquisitionEra(),
                            'ProcessingString': wfi.processingString(),
                            'MergedLFNBase': wfi.request['MergedLFNBase'],
                            'ProcessingVersion': wfi.request['ProcessingVersion'],
                        }
                        ## hackery for ACDC merge assignment
                        if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                        ## xrootd settings on primary and secondary
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists'] == 'true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'secondary' in actions:
                            if actions['secondary'] == 'enabled':
                                print 'Enabling reading the secondary input via xrootd'
                                parameters['TrustPUSitelists'] = True
                            elif actions['secondary'] == 'disabled':
                                parameters['TrustPUSitelists'] = False
                            #in case secondary is blank or not set to enabled or disabled
                            elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                                parameters['TrustPUSitelists'] = True
                        elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC", acdc
                            parameters['execute'] = True
                            wfi.sendLog('actor', "%s was assigned for recovery" % acdc)
                        else:
                            print "no assignment done with this ACDC", acdc
                            sendLog('actor', "%s needs to be assigned" % (acdc), level='critical')
                            continue

                        # print parameters
                        result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                        if not result:
                            print acdc, "was not assigned"
                            sendLog('actor', "%s needs to be assigned" % (acdc), level='critical')
                        else:
                            recovering.add(acdc)

            wfi.sendLog('actor', "ACDCs created for %s" % wfname)

        #===========================================================
        if recover and options.do:
            r = WC.remove_action(wfname)
            if not r:
                sendLog('actor', 'not able to remove the action, interlocking the module', level='critical')
                os.system('touch %s/actor.failed-%s.lock' % (base_eos_dir, os.getpid()))
                sys.exit(-1)

        if message_to_user:
            print wfname, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
            # sendLog('recoveror',message_to_ops,level='warning')

    return
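## --- Illustrative action_list shape consumed by actor() (hypothetical values;
## --- the keys 'Action', 'Parameters', 'AllSteps' and the per-task settings
## --- 'sites', 'xrootd', 'secondary' are the ones read above) ---
_example_action_list = {
    "pdmvserv_task_EXO-Example-00001_v0": {
        "Action": "acdc",
        "Parameters": {
            "AllSteps": {"memory": "4000"},
            "EXO-Example-00001_0": {
                "sites": ["T2_CH_CERN", "T1_US_FNAL"],
                "xrootd": "enabled",
                "secondary": "disabled"
            }
        }
    },
    "pdmvserv_task_SUS-Example-00002_v0": {
        "Action": "special",
        "Parameters": {"action": "bypass"}
    }
}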
def invalidator(url, invalid_status='INVALID'):
    use_mcm = True
    up = componentInfo(soft=['wtc', 'jira'])
    if not up.check(): return
    mcm = McMClient(dev=False)
    invalids = mcm.getA('invalidations', query='status=announced')
    if not invalids: return

    print len(invalids), "Object to be invalidated"
    text_to_batch = defaultdict(str)
    text_to_request = defaultdict(str)
    for invalid in invalids:
        acknowledge = False
        pid = invalid['prepid']
        batch_lookup = invalid['prepid']
        text = ""
        if invalid['type'] == 'request':
            wfn = invalid['object']
            print "need to invalidate the workflow", wfn
            wfo = session.query(Workflow).filter(Workflow.name == wfn).first()
            if wfo:
                ## set forget of that thing (although checkor will recover from it)
                print "setting the status of", wfo.status, "to forget"
                wfo.status = 'forget'
                session.commit()
            else:
                ## do not go on like this, do not acknowledge it
                print wfn, "is set to be rejected, but we do not know about it yet"
                #continue
            wfi = workflowInfo(url, wfn)
            success = "not rejected"
            ## to do, we should find a way to reject the workflow and any related acdc
            successes = invalidate(url, wfi, only_resub=True, with_output=False)
            wfi.sendLog('invalidator', "rejection is performed from McM invalidations request")
            acknowledge = all(successes)
            text = "The workflow %s (%s) was rejected due to invalidation in McM" % (wfn, pid)
            batch_lookup = wfn  ## so that the batch id is taken as the one containing the workflow name
        elif invalid['type'] == 'dataset':
            dataset = invalid['object']
            if '?' in dataset: continue
            if 'None' in dataset: continue
            if 'None-' in dataset: continue
            if 'FAKE-' in dataset: continue
            print "setting", dataset, "to", invalid_status
            success = setDatasetStatus(dataset, invalid_status)
            if success:
                acknowledge = True
                text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % (dataset, pid)
            else:
                msg = "Could not invalidate {}. Please consider contacting the data management team for manual intervention.".format(dataset)
                print(msg)
                sendLog('invalidator', msg, level='critical')
        else:
            print "\t\t", invalid['type'], " type not recognized"

        if acknowledge:
            ## acknowledge invalidation in mcm, provided we can have the api
            print "acknowledgment to mcm"
            ackno_url = '/restapi/invalidations/acknowledge/%s' % (invalid['_id'])
            print "at", ackno_url
            mcm.get(ackno_url)

            # prepare the text for batches
            batches = []
            batches.extend(mcm.getA('batches', query='contains=%s' % batch_lookup))
            batches = filter(lambda b: b['status'] in ['announced', 'done', 'reset'], batches)
            if len(batches):
                bid = batches[-1]['prepid']
                print "batch notification to", bid
                text_to_batch[bid] += text + "\n\n"
            # prepare the text for requests
            text_to_request[pid] += text + "\n\n"

    for bid, text in text_to_batch.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/batches/notify', {"notes": text, "prepid": bid})

    for pid, text in text_to_request.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/requests/notify', {"message": text, "prepids": [pid]})
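## --- Illustrative McM invalidation records (hypothetical values; the fields
## --- 'prepid', 'object', 'type' and '_id' are the ones read above) ---
_example_invalids = [
    {"_id": "abc123", "prepid": "EXO-Example-00001",
     "type": "request", "object": "pdmvserv_task_EXO-Example-00001_v0"},
    {"_id": "def456", "prepid": "EXO-Example-00002",
     "type": "dataset", "object": "/Example/Dataset-v1/AODSIM"},
]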
    def run(self):
        site = self.site
        print "checking on site", site
        si = self.SI
        UC = self.UC
        RDI = self.RDI
        options = self.options
        locks = self.locks
        waiting = self.waiting
        stuck = self.stuck
        missing = self.missing
        remainings = {}

        ds = si.getRemainingDatasets(si.CE_to_SE(site))
        #print len(ds)
        taken_size = 0.
        sum_waiting = 0.
        sum_stuck = 0.
        sum_missing = 0.
        sum_unlocked = 0.
        n_ds = options.ndatasets
        i_ds = 0
        ds_threads = []
        for i_ds, (size, dataset) in enumerate(ds):
            if n_ds and i_ds >= n_ds: break
            remainings[dataset] = {"size": size, "reasons": []}
            #print "-"*10
            if not dataset in locks:
                #print dataset,"is not locked"
                sum_unlocked += size
                remainings[dataset]["reasons"].append('unlock')
            else:
                remainings[dataset]["reasons"].append('lock')
            if dataset in waiting:
                #print dataset,"is waiting for custodial"
                sum_waiting += size
                remainings[dataset]["reasons"].append('tape')
            if dataset in stuck:
                sum_stuck += size
                remainings[dataset]["reasons"].append('stuck-tape')
            if dataset in missing:
                sum_missing += size
                remainings[dataset]["reasons"].append('missing-tape')
            ds_threads.append(DatasetCheckBuster(dataset=dataset, url=url))

        run_threads = ThreadHandler(threads=ds_threads,
                                    label='%s Dataset Threads' % site,
                                    n_threads=10,
                                    start_wait=0,
                                    timeout=None,
                                    verbose=True)
        ## start and sync
        run_threads.run()
        #run_threads.start()
        #while run_threads.is_alive():
        #    time.sleep(10)

        for t in run_threads.threads:
            remainings[t.dataset]["reasons"].extend(t.reasons)
            remainings[t.dataset]["reasons"].sort()
            print t.dataset, remainings[t.dataset]["reasons"]

        #print "\t",sum_waiting,"[GB] could be freed by custodial"
        print "\t", sum_unlocked, "[GB] is not locked by unified"

        print "updating database with remaining datasets"
        RDI.set(site, remainings)
        try:
            eosFile('%s/remaining_%s.json' % (monitor_dir, site), 'w').write(json.dumps(remainings, indent=2)).close()
        except:
            pass

        ld = remainings.items()
        ld.sort(key=lambda i: i[1]['size'], reverse=True)

        table = "<html>Updated %s GMT, <a href=remaining_%s.json>json data</a><br>" % (time.asctime(time.gmtime()), site)

        accumulate = defaultdict(lambda: defaultdict(float))
        for item in remainings:
            tier = item.split('/')[-1]
            for reason in remainings[item]['reasons']:
                accumulate[reason][tier] += remainings[item]['size']
        ## note: the original string had a stray closing </thead> before the header row
        table += "<table border=1><thead><tr><th>Reason</th><th>size [TB]</th></tr></thead>"
        for reason in accumulate:
            s = 0
            table += "<tr><td>%s</td><td><ul>" % reason
            subitems = accumulate[reason].items()
            subitems.sort(key=lambda i: i[1], reverse=True)
            for tier, ss in subitems:
                table += "<li> %s : %10.3f</li>" % (tier, ss / 1024.)
                s += ss / 1024.
            table += "</ul>total : %.3f</td>" % s
        table += "</table>\n"

        table += "<table border=1><thead><tr><th>Dataset</th><th>Size [GB]</th><th>Label</th></tr></thead>\n"
        only_unlock = set()
        for item in ld:
            ds_name = item[0]
            reasons = item[1]['reasons']
            sub_url = '<a href="https://cmsweb.cern.ch/das/request?input=%s">%s</a>' % (ds_name, ds_name)
            if 'unlock' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?block=%s%%23*&node=%s">block</a>' % (ds_name, site)
            if 'unlock' in reasons or 'input' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?inputdataset=%s&mask=RequestName&mask=RequestStatus">input</a>' % (ds_name)
            if 'unlock' in reasons or 'output' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s&mask=RequestName&mask=RequestStatus">output</a>' % (ds_name)
            if 'pilup' in reasons:  ## (sic) reason key as produced upstream
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?mc_pileup=%s&mask=RequestName&mask=RequestStatus">secondary</a>' % (ds_name)
            table += "<tr><td>%s</td><td>%d</td><td><ul>%s</ul></td></tr>\n" % (sub_url, item[1]['size'], "<li>".join([""] + reasons))
            if reasons == ['unlock']:
                only_unlock.add(item[0])
        table += "</table></html>"
        eosFile('%s/remaining_%s.html' % (monitor_dir, site), 'w').write(table).close()

        print "checking on unlock only datasets"
        to_ddm = UC.get('tiers_to_DDM')
        #look_at = list(only_unlock)
        look_at = list(only_unlock)[:20]
        #look_at = list([ds for ds in only_unlock if not ds.endswith('NANOAODSIM')])
        for item in look_at:
            tier = item.split('/')[-1]
            ds_status = getDatasetStatus(item)
            print item, ds_status
            if ds_status == 'PRODUCTION':
                print item, "is found", ds_status, "and unlocked on", site
                if options.invalidate_anything_left_production_once_unlocked:
                    print "Setting status to invalid for", item
                    setDatasetStatus(item, 'INVALID')
            if tier in to_ddm:
                print item, "looks like analysis and still dataops on", site
                if options.change_dataops_subs_to_anaops_once_unlocked:
                    print "Sending", item, "to anaops"
                    allCompleteToAnaOps(url, item)
def invalidator(url, invalid_status='INVALID'):
    use_mcm = True
    up = componentInfo(mcm=use_mcm)
    if not up.check(): return
    mcm = McMClient(dev=False)
    invalids = mcm.getA('invalidations', query='status=announced')

    print len(invalids), "Object to be invalidated"
    text_to_batch = defaultdict(str)
    text_to_request = defaultdict(str)
    for invalid in invalids:
        acknowledge = False
        pid = invalid['prepid']
        batch_lookup = invalid['prepid']
        text = ""
        if invalid['type'] == 'request':
            wfn = invalid['object']
            print "need to invalidate the workflow", wfn
            wfo = session.query(Workflow).filter(Workflow.name == wfn).first()
            if wfo:
                ## set forget of that thing (although checkor will recover from it)
                print "setting the status of", wfo.status, "to forget"
                wfo.status = 'forget'
                session.commit()
            else:
                ## do not go on like this, do not acknowledge it
                print wfn, "is set to be rejected, but we do not know about it yet"
                #continue
            wfi = workflowInfo(url, wfn)
            success = "not rejected"
            ## to do, we should find a way to reject the workflow and any related acdc
            success = reqMgrClient.invalidateWorkflow(url, wfn, current_status=wfi.request['RequestStatus'])
            ## need to find the whole family and reject the whole gang
            family = getWorkflowById(url, wfi.request['PrepID'], details=True)
            for fwl in family:
                ## take out all acdc
                if fwl['RequestDate'] < wfi.request['RequestDate']: continue
                if fwl['RequestType'] != 'Resubmission': continue
                print "rejecting", fwl['RequestName']
                success = reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'])
                print success
            wfi.sendLog('invalidator', "rejection is performed from McM invalidations request")
            acknowledge = True
            text = "The workflow %s (%s) was rejected due to invalidation in McM" % (wfn, pid)
            batch_lookup = wfn  ## so that the batch id is taken as the one containing the workflow name
        elif invalid['type'] == 'dataset':
            dataset = invalid['object']
            if '?' in dataset: continue
            if 'None' in dataset: continue
            if 'None-' in dataset: continue
            if 'FAKE-' in dataset: continue
            print "setting", dataset, "to", invalid_status
            success = setDatasetStatus(dataset, invalid_status)
            if success:
                acknowledge = True
                text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % (dataset, pid)
            else:
                print "invalidation of", dataset, "did not go so well"
        else:
            print "\t\t", invalid['type'], " type not recognized"

        if acknowledge:
            ## acknowledge invalidation in mcm, provided we can have the api
            print "acknowledgment to mcm"
            mcm.get('/restapi/invalidations/acknowledge/%s' % (invalid['_id']))
            # prepare the text for batches
            batches = []
            batches.extend(mcm.getA('batches', query='contains=%s' % batch_lookup))
            batches = filter(lambda b: b['status'] in ['announced', 'done', 'reset'], batches)
            if len(batches):
                bid = batches[-1]['prepid']
                print "batch notification to", bid
                text_to_batch[bid] += text + "\n\n"
            # prepare the text for requests
            text_to_request[pid] += text + "\n\n"

    for bid, text in text_to_batch.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/batches/notify', {"notes": text, "prepid": bid})

    for pid, text in text_to_request.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/requests/notify', {"message": text, "prepids": [pid]})
def rejector(url, specific, options=None):
    #use_mcm = True
    #up = componentInfo(mcm=use_mcm, soft=['mcm'])
    up = componentInfo()
    if not up.check(): return
    #use_mcm = up.status['mcm']
    #mcm = McMClient(dev=False) if use_mcm else None

    if specific and specific.startswith('/'):
        ## this is for a dataset
        print setDatasetStatus(specific, 'INVALID')
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend(session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'assistance-clone').all()
        #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all())
        ## be careful then on clone case by case
        options.clone = True
        print "not supposed to function yet"
        return

    print len(wfs), "to reject"
    if len(wfs) > 1:
        print "\n".join([wfo.name for wfo in wfs])
        answer = raw_input('Reject these')
        if not answer.lower() in ['y', 'yes']: return

    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", specific
            return
        results = []
        wfi = workflowInfo(url, wfo.name)
        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(url, wfo.name, current_status=wfi.request['RequestStatus'])
        comment = ""
        if options.comments:
            comment = ", reason: " + options.comments
        wfi.sendLog('rejector', 'invalidating the workflow by unified operator%s' % comment)
        ## need to find the whole family and reject the whole gang
        family = getWorkflowById(url, wfi.request['PrepID'], details=True)
        for fwl in family:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType'] != 'Resubmission': continue
            ## does not work on second-order acdc
            if 'OriginalRequestName' in fwl and fwl['OriginalRequestName'] != wfi.request['RequestName']: continue
            print "rejecting", fwl['RequestName']
            reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'], cascade=False)
            datasets.update(fwl['OutputDatasets'])

        for dataset in datasets:
            if options.keep:
                print "keeping", dataset, "in its current status"
            else:
                results.append(setDatasetStatus(dataset, 'INVALID'))

        if all(map(lambda result: result in ['None', None, True], results)):
            print wfo.name, "and", datasets, "are rejected"
            if options and options.clone:
                wfo.status = 'trouble'
                session.commit()
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion']) + 1  ## dubious str->int conversion
                else:
                    schema['ProcessingVersion'] = 2

                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tweaks to the original request
                if options.Memory:
                    if schema['RequestType'] == 'TaskChain':
                        it = 1
                        while True:
                            t = 'Task%d' % it
                            it += 1
                            if t in schema:
                                schema[t]['Memory'] = options.Memory
                            else:
                                break
                    else:
                        schema['Memory'] = options.Memory

                if options.Multicore:
                    ## to do : set it properly in taskchains
                    schema['Multicore'] = options.Multicore

                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True

                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob

                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent
                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int, options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1, ntask - 1):
                        schema['Task%d' % it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask - 1
                    schema.pop('Task%d' % ntask)

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop parameters that reqmgr2 will not accept back
                paramBlacklist = ['BlockCloseMaxEvents', 'BlockCloseMaxFiles', 'BlockCloseMaxSize',
                                  'BlockCloseMaxWaitTime', 'CouchWorkloadDBName', 'CustodialGroup',
                                  'CustodialSubType', 'Dashboard', 'GracePeriod', 'HardTimeout',
                                  'InitialPriority', 'inputMode', 'MaxMergeEvents', 'MaxMergeSize',
                                  'MaxRSS', 'MaxVSize', 'MinMergeSize', 'NonCustodialGroup',
                                  'NonCustodialSubType', 'OutputDatasets', 'ReqMgr2Only',
                                  ## a missing comma in the original fused 'RequestDate' and
                                  ## 'RequestorDN' into one string, so neither was dropped
                                  'RequestDate', 'RequestorDN',
                                  'RequestName', 'RequestStatus', 'RequestTransition',
                                  'RequestWorkflow', 'SiteWhitelist', 'SoftTimeout', 'SoftwareVersions',
                                  'SubscriptionPriority', 'Team', 'timeStamp', 'TrustSitelists',
                                  'TrustPUSitelists', 'TotalEstimatedJobs', 'TotalInputEvents',
                                  'TotalInputLumis', 'TotalInputFiles']
                for p in paramBlacklist:
                    if p in schema:
                        schema.pop(p)

                print "submitting"
                if (options.to_stepchain and (schema['RequestType'] == 'TaskChain')):
                    ## transform the schema into a StepChain schema
                    print "Transforming a TaskChain into a StepChain"
                    schema['RequestType'] = 'StepChain'
                    schema['StepChain'] = schema.pop('TaskChain')
                    step = 1
                    while True:
                        if 'Task%d' % step in schema:
                            schema['Step%d' % step] = schema.pop('Task%d' % step)
                            schema['Step%d' % step]['StepName'] = schema['Step%d' % step].pop('TaskName')
                            if 'InputTask' in schema['Step%d' % step]:
                                schema['Step%d' % step]['InputStep'] = schema['Step%d' % step].pop('InputTask')
                            if not 'KeepOutput' in schema['Step%d' % step]:
                                ## this is a weird translation quirk: in a StepChain, absence of KeepOutput means keep the output, while in a TaskChain absence means drop it
                                schema['Step%d' % step]['KeepOutput'] = False
                            step += 1
                        else:
                            break

                print json.dumps(schema, indent=2)
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning", wfo.name
                    print json.dumps(schema, indent=2)
                    return
                print newWorkflow

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfi.sendLog('rejector', 'Cloned into %s by unified operator %s' % (newWorkflow, comment))
                wfi.notifyRequestor('Cloned into %s by unified operator %s' % (newWorkflow, comment), do_batch=False)
            else:
                wfo.status = 'forget'
                wfi.notifyRequestor('Rejected by unified operator %s' % (comment), do_batch=False)
                session.commit()
        else:
            print "error in rejecting", wfo.name, results
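## --- Toy illustration of the TaskChain -> StepChain key translation above ---
## (hypothetical minimal schema; only the renamed keys are shown)
_taskchain_in = {
    'RequestType': 'TaskChain', 'TaskChain': 2,
    'Task1': {'TaskName': 'Digi'},
    'Task2': {'TaskName': 'Reco', 'InputTask': 'Digi', 'KeepOutput': True},
}
## after the loop: RequestType='StepChain', StepChain=2,
## Step1={'StepName': 'Digi', 'KeepOutput': False},   # absent KeepOutput flips to False
## Step2={'StepName': 'Reco', 'InputStep': 'Digi', 'KeepOutput': True}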
def rejector(url, specific, options=None):
    up = componentInfo()

    if specific and specific.startswith('/'):
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend(session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        return

    print len(wfs), "to reject"
    if len(wfs) > 1:
        print "\n".join([wfo.name for wfo in wfs])
        answer = raw_input('Reject these')
        if not answer.lower() in ['y', 'yes']: return

    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", specific
            return
        results = []
        wfi = workflowInfo(url, wfo.name)
        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(url, wfo.name, current_status=wfi.request['RequestStatus'])
        wfi.sendLog('rejector', 'invalidating the workflow by unified operator')
        ## need to find the whole family and reject the whole gang
        family = getWorkflowById(url, wfi.request['PrepID'], details=True)
        for fwl in family:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType'] != 'Resubmission': continue
            if 'OriginalRequestName' in fwl and fwl['OriginalRequestName'] != wfi.request['RequestName']: continue
            print "rejecting", fwl['RequestName']
            reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'])
            datasets.update(fwl['OutputDatasets'])

        for dataset in datasets:
            if options.keep:
                print "keeping", dataset, "in its current status"
            else:
                results.append(setDatasetStatus(dataset, 'INVALID'))

        #if wfi.request['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']:
        #    print 'already',wfi.request['RequestStatus']
        #    if not options.clone:
        #        wfo.status = 'forget'
        #        session.commit()
        #    continue

        if all(map(lambda result: result in ['None', None, True], results)):
            wfo.status = 'forget'
            session.commit()
            print wfo.name, "and", datasets, "are rejected"
            if options and options.clone:
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion']) + 1  ## dubious str->int conversion
                else:
                    schema['ProcessingVersion'] = 2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tweaks to the original request
                if options.Memory:
                    schema['Memory'] = options.Memory
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent
                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int, options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1, ntask - 1):
                        schema['Task%d' % it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask - 1
                    schema.pop('Task%d' % ntask)

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop parameters that reqmgr2 will not accept back
                for p in ['RequestStatus',
                          'RequestTransition',
                          'RequestorDN',
                          'RequestWorkflow',
                          'OutputDatasets',
                          'ReqMgr2Only',
                          #'Group',
                          'RequestDate',
                          #'ConfigCacheUrl',
                          'RequestName',
                          'timeStamp',
                          'SoftwareVersions',
                          'CouchURL'
                          ]:
                    if p in schema:
                        schema.pop(p)

                print "submitting"
                print json.dumps(schema, indent=2)
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning", wfo.name
                    print json.dumps(schema, indent=2)
                    return
                print newWorkflow
                #m = re.search("details\/(.*)\'",response)
                #if not m:
                #    print "error in cloning",wfo.name
                #    print response
                #    print json.dumps( schema, indent=2 )
                #    return
                #newWorkflow = m.group(1)

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfo.status = 'trouble'
                session.commit()
                wfi.sendLog('rejector', 'Cloned into %s by unified operator' % (newWorkflow))
        else:
            print "error in rejecting", wfo.name, results
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue print "Progress [%d/%d]" % (iwfo, len(wfs)) ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( map( lambda s: not s in [ 'completed', 'running-open', 'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name)) continue if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? 
wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count / float(expected_lumis) * 100. completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[wfi.getCampaign()].add(completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! 
for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest( url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex'][ 'request_created'][0]['id'] results.append(True) except: results.append('Failed relval transfer') elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending", out, " to DDM" 
status = pass_to_dynamo( [out], N=n_copies, sites=destinations if destinations else None, group=group_spec if group_spec else None) results.append(status) if status == True: wfi.sendLog( 'closor', '%s is send to dynamo in %s copies %s %s' % (out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor', "could not add " + out + " to dynamo pool. check closor logs.", level='critical') wfi.sendLog( 'closor', "could not add " + out + " to dynamo pool. check closor logs.") else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace( 'announce', 'announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') #batches = json.loads(open('batches.json').read()) for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" if batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """ % (bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to)
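## The announce gate above treats a result as success when it is True, None,
## or the string 'None'. A minimal standalone sketch of that predicate
## (the helper name is hypothetical, not part of the code above):
def _all_results_ok(results):
    ## an empty list counts as success, matching the all(map(...)) idiom above
    return all(r in ['None', None, True] for r in results)

assert _all_results_ok([])
assert _all_results_ok([True, None, 'None'])
assert not _all_results_ok([True, 'No Stats'])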
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: wfs = session.query(Workflow).filter(Workflow.status=='close').all() held = set() print len(wfs),"closing" max_per_round = UC.get('max_per_round').get('closor',None) if options.limit: max_per_round = options.limit random.shuffle( wfs ) if max_per_round: wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name ) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url , batch_name, details=True) batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog('closor', 'Cannot close for now because the batch %s is not all close'% batch_name) continue if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor','Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count/float(expected_lumis)*100. 
completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction) wfi.sendLog('closor',"\t%s"% completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[ wfi.getCampaign()].add( completion_line ) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name,"to be announced" results=[] if not results: for out in outputs: if out in stats and not stats[out]: continue _,dsn,process_string,tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations )))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex']['request_created'][0]['id'] results.append( True ) except: results.append( 'Failed relval transfer' ) elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM 
for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 2 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 0 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: print "Failed DDM, retrying to send",out,"a second time" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 1 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical') if options.force: status = True results.append( status ) if status == None: wfi.sendLog('closor',ddm_text) wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec)) else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) #print results if all(map(lambda result : result in ['None',None,True],results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace('announce','announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor',"workflow outputs are announced") else: wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results ))) elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") held.add( wfo.name ) days_late = 0. 
    retries_late = 10
    really_late_files = [info for info in all_late_files if info['retries'] >= retries_late]
    really_late_files = [info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late]
    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        print subject
        open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2))
    if held:
        sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical')
    #batches = json.loads(open('batches.json').read())
    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            if batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID: %s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s
This is an automated message.
""" % (bname, bname, issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)
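## A small sketch of the late-file filter used above: keep entries with at
## least retries_late retries and a delay of at least days_late days
## (field names follow the dicts built by findLateFiles; the sample data is made up):
def _really_late(all_late_files, days_late=0., retries_late=10):
    late = [info for info in all_late_files if info['retries'] >= retries_late]
    return [info for info in late if info['delay'] / (60 * 60 * 24.) >= days_late]

_sample = [{'retries': 12, 'delay': 3 * 86400., 'workflow': 'wf_A', 'dataset': '/A/B/TIER'},
           {'retries': 2, 'delay': 10 * 86400., 'workflow': 'wf_B', 'dataset': '/C/D/TIER'}]
print _really_late(_sample)  ## only the first entry survives the retry cut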
def close(self): if os.path.isfile('.closor_stop'): print "The closing of workflows is shortened" return url = self.url batch_go = self.batch_go CI = self.CI UC = self.UC wfo = self.wfo jump_the_line = self.jump_the_line batch_goodness = self.batch_goodness check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## what is the expected #lumis self.wfi = workflowInfo(url, wfo.name ) wfi = self.wfi wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url , batch_name, details=True) batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog('closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'%( batch_name, batch_name)) return if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force: ## manually announced ?? self.to_status = 'done' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,self.to_wm_status)) return if jump_the_line: wfi.sendLog('closor','Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) self.outs.append( Output( datasetname = out )) odb = self.outs[-1] odb.workflow = wfo odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) fraction = lumi_count/float(expected_lumis)*100. completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction) wfi.sendLog('closor',"\t%s"% completion_line) if wfi.isRelval() and fraction < batch_goodness: self.batch_warnings[ wfi.getCampaign()].add( completion_line ) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. 
send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) self.all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name,"to be announced" results=[] if not results: for out in outputs: print "dealing with",out if out in stats and not stats[out]: continue _,dsn,process_string,tier = out.split('/') if all_OK[out]: print "setting valid" results.append(setDatasetStatus(out, 'VALID', withFiles=False)) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations )))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex']['request_created'][0]['id'] results.append( True ) except: results.append( 'Failed relval transfer' ) elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 1 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in 
['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" status = pass_to_dynamo( [out], N = n_copies, sites=destinations if destinations else None, group = group_spec if group_spec else None) results.append( status ) if status == True: wfi.sendLog('closor','%s is send to dynamo in %s copies %s %s'%( out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.", level='critical') wfi.sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.") else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) self.to_wm_status = wl_bis.request['RequestStatus'] if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) print results if all(map(lambda result : result in ['None',None,True],results)): if jump_the_line: if not 'announced' in wfo.status: self.to_status = wfo.status.replace('announce','announced') else: self.to_status = 'done' self.closing = True wfi.sendLog('closor',"workflow outputs are announced") else: wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results ))) elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: if wfi.isRelval(): self.to_status = 'forget' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, self.to_wm_status)) else: self.to_status = 'trouble' self.to_wm_status = wfi.request['RequestStatus'] else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") self.held.add( wfo.name )
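## The relval batch gate in close() above is computed once per campaign and
## cached in batch_go. A standalone sketch of that memoization, with the
## remote status lookup stubbed out (getWorkflowByCampaign is assumed, so a
## fake lookup stands in for it here):
_ACTIVE = ['completed', 'running-open', 'running-closed', 'acquired',
           'assigned', 'assignment-approved']

def _batch_can_announce(batch_name, batch_go, status_lookup):
    if batch_name not in batch_go:
        statuses = status_lookup(batch_name)  ## one remote call per batch only
        batch_go[batch_name] = all(s not in _ACTIVE for s in statuses)
    return batch_go[batch_name]

cache = {}
fake_lookup = lambda b: ['announced', 'normal-archived']
print _batch_can_announce('Batch1', cache, fake_lookup)  ## True, and now cached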
def rejector(url, specific, options=None):
    up = componentInfo(soft=['wtc','jira'])
    if not up.check(): return
    if specific and specific.startswith('/'):
        ## this is for a dataset
        print setDatasetStatus(specific, 'INVALID')
        return
    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend( session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
        if not wfs:
            batches = batchInfo().content()
            for bname in batches:
                if specific == bname:
                    for pid in batches[bname]:
                        b_wfs = getWorkflowById(url, pid)
                        for wf in b_wfs:
                            wfs.append( session.query(Workflow).filter(Workflow.name == wf).first())
                    break
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'assistance-clone').all()
        #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all())
        ## be careful then on clone case by case
        options.clone = True
        print "not supposed to function yet"
        return
    print len(wfs),"to reject"
    if len(wfs)>1:
        print "\n".join( [wfo.name for wfo in wfs] )
        answer = raw_input('Reject these')
        if not answer.lower() in ['y','yes']: return
    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject",specific
            return
        wfi = workflowInfo(url, wfo.name)
        comment = ""
        if options.comments: comment = ", reason: "+options.comments
        if options.keep:
            wfi.sendLog('rejector','invalidating the workflow by unified operator%s'%comment)
        else:
            wfi.sendLog('rejector','invalidating the workflow and outputs by unified operator%s'%comment)
        results = invalidate(url, wfi, only_resub=True, with_output= (not options.keep))
        if all(results):
            print wfo.name,"rejected"
            if options and options.clone:
                wfo.status = 'trouble'
                session.commit()
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion'])+1 ## dubious str->int conversion
                else:
                    schema['ProcessingVersion'] = 2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)
                ## a few tweaks to the original request
                if options.Memory:
                    if schema['RequestType'] == 'TaskChain':
                        it = 1
                        while True:
                            t = 'Task%d'%it
                            it += 1
                            if t in schema:
                                schema[t]['Memory'] = options.Memory
                            else:
                                break
                    else:
                        schema['Memory'] = options.Memory
                if options.Multicore:
                    ## to do : set it properly in taskchains
                    if schema['RequestType'] == 'TaskChain':
                        tasks,set_to = options.Multicore.split(':') if ':' in options.Multicore else ("",options.Multicore)
                        set_to = int(set_to)
                        tasks = tasks.split(',') if tasks else ['Task1']
                        it = 1
                        while True:
                            tt = 'Task%d'% it
                            it += 1
                            if tt in schema:
                                tname = schema[tt]['TaskName']
                                if tname in tasks or tt in tasks:
                                    mem = schema[tt]['Memory']
                                    mcore = schema[tt].get('Multicore',1)
                                    factor = (set_to / float(mcore))
                                    fraction_constant = 0.4
                                    mem_per_core_c = int((1-fraction_constant) * mem / float(mcore))
                                    print "mem per core", mem_per_core_c
                                    print "base mem", mem
                                    ## adjusting the parameter in the clone
                                    schema[tt]['Memory'] += (set_to-mcore)*mem_per_core_c
                                    schema[tt]['Multicore'] = set_to
                                    schema[tt]['TimePerEvent'] /= factor
                            else:
                                break
                    else:
                        schema['Multicore'] = options.Multicore
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent
                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int,options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID
                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1,ntask-1):
                        schema['Task%d'%it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask-1
                    schema.pop('Task%d'%ntask)
                if options.priority:
                    schema['RequestPriority'] = options.priority
                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']
                ## drop shit on the way to reqmgr2
                schema = reqMgrClient.purgeClonedSchema( schema )
                print "submitting"
                if (options.to_stepchain and (schema['RequestType']=='TaskChain')):
                    ## transform the schema into StepChain schema
                    print "Transforming a TaskChain into a StepChain"
                    mcore = 0
                    mem = 0
                    schema['RequestType'] = 'StepChain'
                    schema['StepChain'] = schema.pop('TaskChain')
                    schema['SizePerEvent'] = 0
                    schema['TimePerEvent'] = 0
                    step = 1
                    s_n = {}
                    while True:
                        if 'Task%d'%step in schema:
                            sname = 'Step%d'%step
                            schema[sname] = schema.pop('Task%d'%step)
                            tmcore = schema[sname].pop('Multicore')
                            tmem = schema[sname].pop('Memory')
                            if mcore and tmcore != mcore:
                                wfi.sendLog('rejector','the conversion to stepchain encountered different values of Multicore %d != %d'%( tmcore, mcore))
                                sendLog('rejector','the conversion of %s to stepchain encountered different values of Multicore %d != %d'%( wfo.name, tmcore, mcore), level='critical')
                            mcore = max(mcore, tmcore)
                            mem = max(mem, tmem)
                            schema[sname]['StepName'] = schema[sname].pop('TaskName')
                            s_n[ schema[sname]['StepName'] ] = sname
                            if 'InputTask' in schema[sname]:
                                schema[sname]['InputStep'] = schema[sname].pop('InputTask')
                            eff = 1.
                            up_s = sname
                            while True:
                                ## climb up a step. supposedly already all converted
                                up_s = s_n.get(schema[up_s].get('InputStep',None),None)
                                if up_s:
                                    ## multiply with the efficiency
                                    eff *= schema[up_s].get('FilterEfficiency',1.)
                                else:
                                    ## or stop there
                                    break
                            if not 'KeepOutput' in schema[sname]:
                                ## this is a weird translation capability. Absence of KeepOutput in a step means: keep the output,
                                ## while in a TaskChain absence means: drop
                                schema[sname]['KeepOutput'] = False
                            schema['TimePerEvent'] += eff*schema[sname].pop('TimePerEvent')
                            schema['SizePerEvent'] += eff*schema[sname].pop('SizePerEvent')
                            step += 1
                        else:
                            break
                    schema['Multicore'] = mcore
                    schema['Memory'] = mem
                print json.dumps( schema, indent=2 )
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    msg = "Error in cloning {}".format(wfo.name)
                    print(msg)
                    wfi.sendLog('rejector',msg)
                    # Get the error message
                    time.sleep(5)
                    data = reqMgrClient.requestManagerPost(url, "/reqmgr2/data/request", schema)
                    wfi.sendLog('rejector',data)
                    print json.dumps( schema, indent=2 )
                    return
                print newWorkflow
                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfi.sendLog('rejector','Cloned into %s by unified operator %s'%( newWorkflow, comment ))
                wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False)
            else:
                wfo.status = 'trouble' if options.set_trouble else 'forget'
                wfi.notifyRequestor('Rejected by unified operator %s'%( comment ),do_batch=False)
                session.commit()
        else:
            msg = "Error in rejecting {}: {}".format(wfo.name,results)
            print(msg)
            wfi.sendLog('rejector',msg)
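## In the TaskChain -> StepChain conversion above, TimePerEvent and
## SizePerEvent of each step are weighted by the product of the
## FilterEfficiency of all upstream steps. A toy illustration of that
## accumulation, assuming a linear chain (real ReqMgr schemas carry many
## more keys than this):
def _chain_time_per_event(steps):
    ## steps: list of dicts ordered from first to last step
    total, eff = 0., 1.
    for s in steps:
        total += eff * s['TimePerEvent']
        eff *= s.get('FilterEfficiency', 1.)  ## applies to downstream steps only
    return total

print _chain_time_per_event([
    {'TimePerEvent': 10., 'FilterEfficiency': 0.1},
    {'TimePerEvent': 50.},
])  ## 10 + 0.1*50 = 15.0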
def rejector(url, specific, options=None): up = componentInfo(soft=['wtc', 'jira']) #if not up.check(): return if specific and specific.startswith('/'): ## this is for a dataset print setDatasetStatus(specific, 'INVALID') return if options.filelist: wfs = [] for line in filter(None, open(options.filelist).read().split('\n')): print line wfs.extend( session.query(Workflow).filter( Workflow.name.contains(line)).all()) elif specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() if not wfs: batches = batchInfo().content() for bname in batches: if specific == bname: for pid in batches[bname]: b_wfs = getWorkflowById(url, pid) for wf in b_wfs: wfs.append( session.query(Workflow).filter( Workflow.name == wf).first()) break else: wfs = session.query(Workflow).filter( Workflow.status == 'assistance-clone').all() #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all()) ## be careful then on clone case by case options.clone = True print "not supposed to function yet" return print len(wfs), "to reject" if len(wfs) > 1: print "\n".join([wfo.name for wfo in wfs]) answer = raw_input('Reject these') if not answer.lower() in ['y', 'yes']: return for wfo in wfs: #wfo = session.query(Workflow).filter(Workflow.name == specific).first() if not wfo: print "cannot reject", spec return wfi = workflowInfo(url, wfo.name) comment = "" if options.comments: comment = ", reason: " + options.comments if options.keep: wfi.sendLog( 'rejector', 'invalidating the workflow by unified operator%s' % comment) else: wfi.sendLog( 'rejector', 'invalidating the workflow and outputs by unified operator%s' % comment) results = invalidate(url, wfi, only_resub=True, with_output=(not options.keep)) if all(results): print wfo.name, "rejected" if options and options.clone: wfo.status = 'trouble' session.commit() schema = wfi.getSchema() schema['Requestor'] = os.getenv('USER') schema['Group'] = 'DATAOPS' schema['OriginalRequestName'] = wfo.name if 'ProcessingVersion' in schema: schema['ProcessingVersion'] = int( schema['ProcessingVersion'] ) + 1 ## dubious str->int conversion else: schema['ProcessingVersion'] = 2 for k in schema.keys(): if k.startswith('Team'): schema.pop(k) if k.startswith('checkbox'): schema.pop(k) ## a few tampering of the original request if options.Memory: if schema['RequestType'] == 'TaskChain': it = 1 while True: t = 'Task%d' % it it += 1 if t in schema: schema[t]['Memory'] = options.Memory else: break else: schema['Memory'] = options.Memory if options.short_task and schema['RequestType'] == 'TaskChain': translate = {} it = 1 while True: tt = 'Task%d' % it if tt in schema: tname = schema[tt]['TaskName'] ntname = 'T%d' % it translate[tname] = ntname it += 1 schema[tt]['TaskName'] = ntname if 'InputTask' in schema[tt]: itname = schema[tt]['InputTask'] schema[tt]['InputTask'] = translate[itname] else: break for k in schema.get('ProcessingString', {}).keys(): schema['ProcessingString'][ translate[k]] = schema['ProcessingString'].pop(k) for k in schema.get('AcquisitionEra', {}).keys(): schema['AcquisitionEra'][ translate[k]] = schema['AcquisitionEra'].pop(k) if options.Multicore: ## to do : set it properly in taskchains if schema['RequestType'] == 'TaskChain': tasks, set_to = options.Multicore.split( ':') if ':' in options.Multicore else ( "", options.Multicore) set_to = int(set_to) tasks = tasks.split(',') if tasks else ['Task1'] it = 1 while True: tt = 'Task%d' % it it += 1 if tt in schema: tname = schema[tt]['TaskName'] if tname in tasks or tt in 
tasks: mem = schema[tt]['Memory'] mcore = schema[tt].get('Multicore', 1) factor = (set_to / float(mcore)) fraction_constant = 0.4 mem_per_core_c = int( (1 - fraction_constant) * mem / float(mcore)) print "mem per core", mem_per_core_c print "base mem", mem ## adjusting the parameter in the clone schema[tt]['Memory'] += ( set_to - mcore) * mem_per_core_c schema[tt]['Multicore'] = set_to schema[tt]['TimePerEvent'] /= factor else: break else: schema['Multicore'] = options.Multicore if options.deterministic: if schema['RequestType'] == 'TaskChain': schema['Task1']['DeterministicPileup'] = True if options.EventsPerJob: if schema['RequestType'] == 'TaskChain': schema['Task1']['EventsPerJob'] = options.EventsPerJob else: schema['EventsPerJob'] = options.EventsPerJob if options.EventAwareLumiBased: schema['SplittingAlgo'] = 'EventAwareLumiBased' if options.TimePerEvent: schema['TimePerEvent'] = options.TimePerEvent if options.ProcessingString: schema['ProcessingString'] = options.ProcessingString if options.AcquisitionEra: schema['AcquisitionEra'] = options.AcquisitionEra if options.runs: schema['RunWhitelist'] = map(int, options.runs.split(',')) if options.PrepID: schema['PrepID'] = options.PrepID if schema['RequestType'] == 'TaskChain' and options.no_output: ntask = schema['TaskChain'] for it in range(1, ntask - 1): schema['Task%d' % it]['KeepOutput'] = False schema['TaskChain'] = ntask - 1 schema.pop('Task%d' % ntask) if options.priority: schema['RequestPriority'] = options.priority ## update to the current priority schema['RequestPriority'] = wfi.request['RequestPriority'] ## drop shit on the way to reqmgr2 schema = reqMgrClient.purgeClonedSchema(schema) print "submitting" if (options.to_stepchain and (schema['RequestType'] == 'TaskChain')): ## transform the schema into StepChain schema print "Transforming a TaskChain into a StepChain" mcore = 0 mem = 0 schema['RequestType'] = 'StepChain' schema['StepChain'] = schema.pop('TaskChain') schema['SizePerEvent'] = 0 schema['TimePerEvent'] = 0 step = 1 s_n = {} while True: if 'Task%d' % step in schema: sname = 'Step%d' % step schema[sname] = schema.pop('Task%d' % step) tmcore = schema[sname].pop('Multicore') tmem = schema[sname].pop('Memory') if mcore and tmcore != mcore: wfi.sendLog( 'rejector', 'the conversion to stepchain encoutered different value of Multicore %d != %d' % (tmcore, mcore)) sendLog( 'rejector', 'the conversion of %s to stepchain encoutered different value of Multicore %d != %d' % (wfo.name, tmcore, mcore), level='critical') mcore = max(mcore, tmcore) mem = max(mem, tmem) schema[sname]['StepName'] = schema[sname].pop( 'TaskName') s_n[schema[sname]['StepName']] = sname if 'InputTask' in schema[sname]: schema[sname]['InputStep'] = schema[sname].pop( 'InputTask') eff = 1. up_s = sname while True: ## climb up a step. supposedely already all converted up_s = s_n.get( schema[up_s].get('InputStep', None), None) if up_s: ## multiply with the efficiency eff *= schema[up_s].get( 'FilterEfficiency', 1.) else: ## or stop there break if not 'KeepOutput' in schema[sname]: ## this is a weird translation capability. Absence of keepoutput in step means : keep the output. 
while in TaskChain absence means : drop schema[sname]['KeepOutput'] = False schema['TimePerEvent'] += eff * schema[sname].pop( 'TimePerEvent') schema['SizePerEvent'] += eff * schema[sname].pop( 'SizePerEvent') step += 1 else: break schema['Multicore'] = mcore schema['Memory'] = mem print json.dumps(schema, indent=2) newWorkflow = reqMgrClient.submitWorkflow(url, schema) if not newWorkflow: msg = "Error in cloning {}".format(wfo.name) print(msg) wfi.sendLog('rejector', msg) # Get the error message time.sleep(5) data = reqMgrClient.requestManagerPost( url, "/reqmgr2/data/request", schema) wfi.sendLog('rejector', data) print json.dumps(schema, indent=2) return print newWorkflow data = reqMgrClient.setWorkflowApproved(url, newWorkflow) print data wfi.sendLog( 'rejector', 'Cloned into %s by unified operator %s' % (newWorkflow, comment)) #wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False) else: wfo.status = 'trouble' if options.set_trouble else 'forget' wfi.notifyRequestor('Rejected by unified operator %s' % (comment), do_batch=False) session.commit() else: msg = "Error in rejecting {}: {}".format(wfo.name, results) print(msg) wfi.sendLog('rejector', msg)
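## The Multicore override above rescales task memory assuming ~40% of the
## footprint is constant and the rest scales with the core count. A worked
## sketch of that arithmetic (the numbers are illustrative):
def _scale_memory(mem, mcore, set_to, fraction_constant=0.4):
    mem_per_core = int((1 - fraction_constant) * mem / float(mcore))
    return mem + (set_to - mcore) * mem_per_core

## e.g. 2000 MB at 1 core -> 1200 MB per extra core -> 10400 MB at 8 cores
print _scale_memory(2000, 1, 8)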
def closor(url, specific=None):
    if not componentInfo().check(): return
    CI = campaignInfo()
    LI = lockInfo()
    ## manually closed-out workflows should get to close with checkor
    for wfo in session.query(Workflow).filter(Workflow.status=='close').all():
        if specific and not specific in wfo.name: continue
        ## what is the expected #lumis
        wl = getWorkLoad(url, wfo.name)
        wfo.wm_status = wl['RequestStatus']
        if wl['RequestStatus'] in ['announced','normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wl['RequestStatus']
            print wfo.name,"is announced already",wfo.wm_status
            session.commit()
        expected_lumis = 1
        if not 'TotalInputLumis' in wl:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wl['TotalInputLumis']
        ## what are the outputs
        outputs = wl['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = []
        #print outputs
        if len(outputs):
            print wfo.name,wl['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            print "\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            fraction = wfo.fraction_for_closing
            fraction = 0.0
            all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
        ## only that status can let me go into announced
        if wl['RequestStatus'] in ['closed-out']:
            print wfo.name,"to be announced"
            results = [] #'dummy']
            if not results:
                for (io,out) in enumerate(outputs):
                    if all_OK[io]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        to_DDM = (wl['RequestType'] == 'ReDigi' and not ('DQM' in tier))
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wl and wl['Campaign']:
                                campaign = wl['Campaign']
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## inject to DDM when necessary
                        passed_to_DDM = True
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                            if status!=0:
                                print "Failed DDM, retrying a second time"
                                status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                                if status!=0:
                                    results.append("Failed DDM for %s"% out)
                                    sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    passed_to_DDM = False
                            if passed_to_DDM:
                                ## make a lock release
                                LI.release_everywhere( out, reason = 'global unlock after passing to DDM')
                                pass
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')
                if all(map(lambda result : result in ['None',None,True],results)):
                    ## only announce if all previous are fine
                    results.append(reqMgrClient.announceWorkflowCascade(url, wfo.name))
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                wfo.status = 'done'
                session.commit()
                print wfo.name,"is announced"
            else:
                print "ERROR with ",wfo.name,"to be announced",json.dumps( results )
        else:
            print wfo.name,"not good for announcing:",wl['RequestStatus']
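## The DDM injection above shells out and retries once on a non-zero exit
## code. A generic sketch of that retry-once pattern with subprocess (the
## command here is a harmless placeholder, not assignDatasetToSite.py):
import subprocess

def _call_with_one_retry(cmd):
    status = subprocess.call(cmd)
    if status != 0:
        print "first attempt failed, retrying once"
        status = subprocess.call(cmd)
    return status

print _call_with_one_retry(['python', '-c', 'print "ok"'])  ## 0 on success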
def actor(url,options=None): if userLock('actor'): return up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed try: action_list = json.loads(os.popen('curl -s -k https://vocms0113.cern.ch:80/getaction?days=15').read()) ## now we have a list of things that we can take action on except: print "Not able to load action list :(" sendLog('actor','Not able to load action list', level='critical') return print action_list if not action_list: print "EMPTY!" return for wfname in action_list: print '-'*100 print "Looking at",wfname,"for recovery options" to_clone = False to_acdc = False for key in action_list[wfname]: if key == 'Parameters': tasks = action_list[wfname][key] elif key == 'Action' and action_list[wfname][key] == 'acdc': print "Going to create ACDCs for ", wfname to_acdc = True elif key == 'Action' and action_list[wfname][key] == 'clone': print "Going to clone ", wfname to_clone = True if not to_acdc and not to_clone: sendLog('actor','Action submitted for something other than acdc and clone for workflow %s'%wfname,level='critical') print "Can only do acdcs and clones! Skipping workflow ",wfname continue if not tasks: sendLog('actor','Empty action submitted for workflow %s'%wfname,level='critical') print "Moving on. Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor','Going to clone %s'%wfname) results=[] datasets = set(wfi.request['OutputDatasets']) comment="" if 'comment' in tasks: comment = ", reason: "+ tasks['comment'] wfi.sendLog('actor',"invalidating the workflow by traffic controller %s"%comment) #Reject all workflows in the family #first reject the original workflow. reqMgrClient.invalidateWorkflow(url, wfi.request['RequestName'], current_status=wfi.request['RequestStatus'], cascade=False) #Then reject any ACDCs associated with that workflow if 'ACDCs' in action_list[wfname]: children = action_list[wfname]['ACDCs'] for child in children: wfi.sendLog('actor',"rejecting %s"%child) wfi_acdc = workflowInfo(url, child) reqMgrClient.invalidateWorkflow(url, wfi_acdc.request['RequestName'], current_status=wfi_acdc.request['RequestStatus'], cascade=False) datasets.update( wfi_acdc.request['OutputDatasets'] ) #Invalidate all associated output datasets for dataset in datasets: results.append( setDatasetStatus(dataset, 'INVALID') ) if all(map(lambda result : result in ['None',None,True],results)): wfi.sendLog('actor',"%s and children are rejected"%wfname) cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except: sendLog('actor','Failed to create clone for %s! Check logs for more information. 
Action will need to be resubmitted.'%wfname,level='critical') wfi.sendLog('actor','Failed to create clone for %s!'%wfname) remove_action(wfname) if not cloned: recover = False wfi.sendLog('actor','Failed to create clone for %s!'%wfname) sendLog('actor','Failed to create clone for %s!'%wfname,level='critical') else: wfi.sendLog('actor',"Workflow %s cloned"%wfname) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append({setting:allTasksDefaults[setting]}) print "Tasks is " print tasks all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog('actor','Cannot create ACDCS for %s because WMErr cannot be reached.'%wfname,level='critical') continue if not WMErr: sendLog('actor','Cannot create ACDCS for %s because WMErr is blank.'%wfname,level='critical') print "Moving on. WMErr is blank" continue try: where_to_run, missing_to_run,missing_to_run_at = wfi.getRecoveryInfo() print "Where to run = " print where_to_run except: sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical') print "Moving on. Cannot access recovery info for " + wfname continue if not where_to_run: sendLog('actor','Cannot create ACDCS for %s because site list cannot be found.'%wfname,level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for",wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog('actor','Cannot create ACDCS for %s because it is a pLHE workflow.'%wfname,level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task # print "Full task name is " + fulltaskname wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in ['Processing','Production','Merge']: wrong_task=True wfi.sendLog('actor', "Skipping task %s because the taskType is %s. 
Can only ACDC Processing, Production, or Merge tasks"%( fulltaskname, task_info.taskType)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites=[SI.SE_to_CE(actions[action])] else: assign_to_sites=list(set([SI.SE_to_CE(site) for site in actions[action]])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]])) print "Found",sorted(assign_to_sites),"as sites where to run the ACDC at, from the acdc doc of ",wfname print "Going to run at",sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do = options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog('actor','ACDC created for task %s. 
Actions taken \n%s'%(fulltaskname,list(actions))) team = wfi.request['Teams'][0] parameters={ 'SiteWhitelist' : sorted(assign_to_sites), 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists']=='true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC",acdc parameters['execute']=True wfi.sendLog('actor',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC",acdc sendLog('actor',"%s needs to be assigned"%(acdc), level='critical') continue # print parameters result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) if not result: print acdc,"was not assigned" sendLog('actor',"%s needs to be assigned"%(acdc), level='critical') else: recovering.add( acdc ) wfi.sendLog('actor',"ACDCs created for %s"%wfname) #=========================================================== if recover and options.do: remove_action(wfname) if message_to_user: print wfname,"to be notified to user(DUMMY)",message_to_user if message_to_ops: print 'message' #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return
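## The 'AllSteps' handling in actor() above copies batch-wide defaults into
## every task; note that tasks[task] is a dict, so the .append() branch above
## would raise AttributeError, and plain assignment covers both cases. A
## minimal sketch of that merge, keeping the original "defaults win" semantics:
def _apply_all_steps(tasks):
    defaults = tasks.pop('AllSteps', {})
    for task, settings in tasks.items():
        for key, value in defaults.items():
            settings[key] = value  ## AllSteps wins over per-task values
    return tasks

print _apply_all_steps({'AllSteps': {'memory': 4000},
                        'Task1': {'sites': ['T1_US_FNAL']},
                        'Task2': {'memory': 8000}})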
def rejector(url, specific, options=None):
    up = componentInfo()
    if specific and specific.startswith('/'):
        return
    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend(session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        return
    print len(wfs), "to reject"
    if len(wfs) > 1:
        print "\n".join([wfo.name for wfo in wfs])
        answer = raw_input('Reject these')
        if not answer.lower() in ['y', 'yes']: return
    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", specific
            return
        results = []
        wfi = workflowInfo(url, wfo.name)
        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(url, wfo.name, current_status=wfi.request['RequestStatus'])
        wfi.sendLog('rejector', 'invalidating the workflow by unified operator')
        ## need to find the whole family and reject the whole gang
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        for fwl in familly:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType'] != 'Resubmission': continue
            if 'OriginalRequestName' in fwl and fwl['OriginalRequestName'] != wfi.request['RequestName']: continue
            print "rejecting", fwl['RequestName']
            reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'])
            datasets.update(fwl['OutputDatasets'])
        for dataset in datasets:
            if options.keep:
                print "keeping", dataset, "in its current status"
            else:
                results.append(setDatasetStatus(dataset, 'INVALID'))
        #if wfi.request['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']:
        #    print 'already',wfi.request['RequestStatus']
        #    if not options.clone:
        #        wfo.status = 'forget'
        #        session.commit()
        #        continue
        if all(map(lambda result: result in ['None', None, True], results)):
            wfo.status = 'forget'
            session.commit()
            print wfo.name, "and", datasets, "are rejected"
            if options and options.clone:
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion']) + 1 ## dubious str->int conversion
                else:
                    schema['ProcessingVersion'] = 2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)
                ## a few tweaks to the original request
                if options.Memory:
                    schema['Memory'] = options.Memory
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent
                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int, options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID
                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1, ntask - 1):
                        schema['Task%d' % it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask - 1
                    schema.pop('Task%d' % ntask)
                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']
                ## drop shit on the way to reqmgr2
                for p in [
                        'RequestStatus',
                        'RequestTransition',
                        'RequestorDN',
                        'RequestWorkflow',
                        'OutputDatasets',
                        'ReqMgr2Only',
                        #'Group',
                        'RequestDate',
                        #'ConfigCacheUrl',
                        'RequestName',
                        'timeStamp',
                        'SoftwareVersions',
                        'CouchURL'
                ]:
                    if p in schema:
                        schema.pop(p)
                #pass
                print "submitting"
                print json.dumps(schema, indent=2)
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning", wfo.name
                    print json.dumps(schema, indent=2)
                    return
                print newWorkflow
                #m = re.search("details\/(.*)\'",response)
                #if not m:
                #    print "error in cloning",wfo.name
                #    print response
                #    print json.dumps( schema, indent=2 )
                #    return
                #newWorkflow = m.group(1)
                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfo.status = 'trouble'
                session.commit()
                wfi.sendLog('rejector', 'Cloned into %s by unified operator' % (newWorkflow))
        else:
            print "error in rejecting", wfo.name, results
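## Cloning above strips ReqMgr2 output/bookkeeping fields before resubmission.
## A standalone sketch of that purge on a toy schema (the key list mirrors the
## one above; the schema content is made up):
_DROP_ON_CLONE = ['RequestStatus', 'RequestTransition', 'RequestorDN',
                  'RequestWorkflow', 'OutputDatasets', 'ReqMgr2Only',
                  'RequestDate', 'RequestName', 'timeStamp',
                  'SoftwareVersions', 'CouchURL']

def _purge_for_clone(schema):
    for p in _DROP_ON_CLONE:
        schema.pop(p, None)  ## tolerate absent keys
    return schema

print _purge_for_clone({'RequestName': 'old', 'Memory': 2000})  ## {'Memory': 2000}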
def run(self):
    site = self.site
    print "checking on site", site
    si = self.SI
    UC = self.UC
    RDI = self.RDI
    options = self.options
    locks = self.locks
    waiting = self.waiting
    stuck = self.stuck
    missing = self.missing
    remainings = {}

    ds = si.getRemainingDatasets(si.CE_to_SE(site))
    #print len(ds)
    taken_size = 0.
    sum_waiting = 0.
    sum_stuck = 0.
    sum_missing = 0.
    sum_unlocked = 0.
    n_ds = options.ndatasets
    i_ds = 0
    ds_threads = []
    for i_ds, (size, dataset) in enumerate(ds):
        if n_ds and i_ds >= n_ds:
            break
        remainings[dataset] = {"size": size, "reasons": []}
        if not dataset in locks:
            #print dataset, "is not locked"
            sum_unlocked += size
            remainings[dataset]["reasons"].append('unlock')
        else:
            remainings[dataset]["reasons"].append('lock')
        if dataset in waiting:
            #print dataset, "is waiting for custodial"
            sum_waiting += size
            remainings[dataset]["reasons"].append('tape')
        if dataset in stuck:
            sum_stuck += size
            remainings[dataset]["reasons"].append('stuck-tape')
        if dataset in missing:
            sum_missing += size
            remainings[dataset]["reasons"].append('missing-tape')
        ds_threads.append(DatasetCheckBuster(dataset=dataset, url=url))

    run_threads = ThreadHandler(threads=ds_threads,
                                label='%s Dataset Threads' % site,
                                n_threads=10,
                                start_wait=0,
                                timeout=None,
                                verbose=True)
    ## start and sync
    run_threads.run()
    #run_threads.start()
    #while run_threads.is_alive():
    #    time.sleep(10)

    for t in run_threads.threads:
        remainings[t.dataset]["reasons"].extend(t.reasons)
        remainings[t.dataset]["reasons"].sort()
        print t.dataset, remainings[t.dataset]["reasons"]

    #print "\t", sum_waiting, "[GB] could be freed by custodial"
    print "\t", sum_unlocked, "[GB] is not locked by unified"

    print "updating database with remaining datasets"
    RDI.set(site, remainings)
    try:
        eosFile('%s/remaining_%s.json' % (monitor_dir, site), 'w').write(json.dumps(remainings, indent=2)).close()
    except:
        pass

    ld = remainings.items()
    ld.sort(key=lambda i: i[1]['size'], reverse=True)
    table = "<html>Updated %s GMT, <a href=remaining_%s.json>json data</a><br>" % (time.asctime(time.gmtime()), site)

    accumulate = defaultdict(lambda: defaultdict(float))
    for item in remainings:
        tier = item.split('/')[-1]
        for reason in remainings[item]['reasons']:
            accumulate[reason][tier] += remainings[item]['size']

    table += "<table border=1><thead><tr><th>Reason</th><th>size [TB]</th></tr></thead>"
    for reason in accumulate:
        s = 0
        table += "<tr><td>%s</td><td><ul>" % reason
        subitems = accumulate[reason].items()
        subitems.sort(key=lambda i: i[1], reverse=True)
        for tier, ss in subitems:
            table += "<li> %s : %10.3f</li>" % (tier, ss / 1024.)
            s += ss / 1024.
        table += "</ul>total : %.3f</td>" % s
    table += "</table>\n"

    table += "<table border=1><thead><tr><th>Dataset</th><th>Size [GB]</th><th>Label</th></tr></thead>\n"
    only_unlock = set()
    for item in ld:
        ds_name = item[0]
        reasons = item[1]['reasons']
        sub_url = '<a href="https://cmsweb.cern.ch/das/request?input=%s">%s</a>' % (ds_name, ds_name)
        if 'unlock' in reasons:
            sub_url += ', <a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?block=%s%%23*&node=%s">block</a>' % (ds_name, site)
        if 'unlock' in reasons or 'input' in reasons:
            sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?inputdataset=%s&mask=RequestName&mask=RequestStatus">input</a>' % (ds_name)
        if 'unlock' in reasons or 'output' in reasons:
            sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s&mask=RequestName&mask=RequestStatus">output</a>' % (ds_name)
        if 'pilup' in reasons:  ## 'pilup' spelling must match the reason label set by DatasetCheckBuster
            sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?mc_pileup=%s&mask=RequestName&mask=RequestStatus">secondary</a>' % (ds_name)
        table += "<tr><td>%s</td><td>%d</td><td><ul>%s</ul></td></tr>\n" % (sub_url, item[1]['size'], "<li>".join([""] + reasons))
        if reasons == ['unlock']:
            only_unlock.add(item[0])
    table += "</table></html>"
    eosFile('%s/remaining_%s.html' % (monitor_dir, site), 'w').write(table).close()

    print "checking on unlock-only datasets"
    to_ddm = UC.get('tiers_to_DDM')
    #look_at = list(only_unlock)
    look_at = list(only_unlock)[:20]
    #look_at = list([ds for ds in only_unlock if not ds.endswith('NANOAODSIM')])
    for item in look_at:
        tier = item.split('/')[-1]
        ds_status = getDatasetStatus(item)
        print item, ds_status
        if ds_status == 'PRODUCTION':
            print item, "is found", ds_status, "and unlocked on", site
            if options.invalidate_anything_left_production_once_unlocked:
                print "Setting status to invalid for", item
                setDatasetStatus(item, 'INVALID')
        if tier in to_ddm:
            print item, "looks like analysis and still dataops on", site
            if options.change_dataops_subs_to_anaops_once_unlocked:
                print "Sending", item, "to anaops"
                allCompleteToAnaOps(url, item)
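
## A toy illustration (invented dataset names and sizes) of the per-reason,
## per-tier accumulation performed above before rendering the HTML summary;
## sizes are in GB and divided by 1024 for the TB column.
from collections import defaultdict

toy_remainings = {
    '/ToyA/Era-v1/AODSIM':     {'size': 2048., 'reasons': ['unlock']},
    '/ToyB/Era-v1/MINIAODSIM': {'size': 512.,  'reasons': ['lock', 'tape']},
}
toy_accumulate = defaultdict(lambda: defaultdict(float))
for item in toy_remainings:
    tier = item.split('/')[-1]  ## the datatier is the last path component
    for reason in toy_remainings[item]['reasons']:
        toy_accumulate[reason][tier] += toy_remainings[item]['size']
for reason in toy_accumulate:
    for tier, ss in toy_accumulate[reason].items():
        print reason, tier, "%.3f TB" % (ss / 1024.)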
def invalidator(url, invalid_status='INVALID'):
    use_mcm = True
    up = componentInfo(soft=['wtc'])
    if not up.check():
        return
    mcm = McMClient(dev=False)
    invalids = mcm.getA('invalidations', query='status=announced')
    if not invalids:
        return
    print len(invalids), "objects to be invalidated"
    text_to_batch = defaultdict(str)
    text_to_request = defaultdict(str)
    for invalid in invalids:
        acknowledge = False
        pid = invalid['prepid']
        batch_lookup = invalid['prepid']
        text = ""
        if invalid['type'] == 'request':
            wfn = invalid['object']
            print "need to invalidate the workflow", wfn
            wfo = session.query(Workflow).filter(Workflow.name == wfn).first()
            if wfo:
                ## set it to forget (although checkor will recover from it)
                print "setting the status of", wfo.status, "to forget"
                wfo.status = 'forget'
                session.commit()
            else:
                ## do not go on like this; do not acknowledge it
                print wfn, "is set to be rejected, but we do not know about it yet"
                #continue
            wfi = workflowInfo(url, wfn)
            success = "not rejected"
            ## to do: we should find a way to reject the workflow and any related acdc
            successes = invalidate(url, wfi, only_resub=True, with_output=False)
            wfi.sendLog('invalidator', "rejection is performed from McM invalidations request")
            acknowledge = all(successes)
            text = "The workflow %s (%s) was rejected due to invalidation in McM" % (wfn, pid)
            batch_lookup = wfn  ## so that the batch id is taken as the one containing the workflow name
        elif invalid['type'] == 'dataset':
            dataset = invalid['object']
            if '?' in dataset: continue
            if 'None' in dataset: continue
            if 'None-' in dataset: continue
            if 'FAKE-' in dataset: continue
            print "setting", dataset, "to", invalid_status
            success = setDatasetStatus(dataset, invalid_status)
            if success:
                acknowledge = True
                text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % (dataset, pid)
            else:
                print "invalidation of", dataset, "did not go so well"
        else:
            print "\t\t", invalid['type'], "type not recognized"

        if acknowledge:
            ## acknowledge the invalidation in McM, provided we have the api
            print "acknowledgment to mcm"
            ackno_url = '/restapi/invalidations/acknowledge/%s' % (invalid['_id'])
            print "at", ackno_url
            mcm.get(ackno_url)

            ## prepare the text for batches
            batches = []
            batches.extend(mcm.getA('batches', query='contains=%s' % batch_lookup))
            batches = filter(lambda b: b['status'] in ['announced', 'done', 'reset'], batches)
            if len(batches):
                bid = batches[-1]['prepid']
                print "batch notification to", bid
                text_to_batch[bid] += text + "\n\n"
            ## prepare the text for requests
            text_to_request[pid] += text + "\n\n"

    for bid, text in text_to_batch.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/batches/notify', {"notes": text, "prepid": bid})

    for pid, text in text_to_request.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/requests/notify', {"message": text, "prepids": [pid]})
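
## A sketch of the notification fan-out used above: texts are accumulated per
## batch prepid and per request prepid so that McM is notified once per target
## rather than once per invalidation. The triplets below are invented; the
## real texts come from the invalidation loop.
from collections import defaultdict

toy_text_to_batch = defaultdict(str)
toy_text_to_request = defaultdict(str)
for (bid, pid, text) in [('B-0001', 'P-0001', 'workflow w1 rejected'),
                         ('B-0001', 'P-0002', 'dataset d2 set INVALID')]:
    toy_text_to_batch[bid] += text + "\n\n"
    toy_text_to_request[pid] += text + "\n\n"
for bid, text in toy_text_to_batch.items():
    ## in production this becomes a single mcm.put('/restapi/batches/notify', ...)
    print "would notify batch", bid, "with", repr(text)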
def close(self):
    if os.path.isfile('.closor_stop'):
        print "The closing of workflows is shortened"
        return

    url = self.url
    batch_go = self.batch_go
    CI = self.CI
    UC = self.UC
    wfo = self.wfo
    jump_the_line = self.jump_the_line
    batch_goodness = self.batch_goodness
    check_parentage_to_announce = UC.get('check_parentage_to_announce')
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    ## what is the expected number of lumis
    self.wfi = workflowInfo(url, wfo.name)
    wfi = self.wfi
    wfo.wm_status = wfi.request['RequestStatus']

    if wfi.isRelval():
        has_batch_go = False
        batch_name = wfi.getCampaign()
        if not batch_name in batch_go:
            ## estimate whether this can be announced: only once per batch
            in_batches = getWorkflowByCampaign(url, batch_name, details=True)
            batch_go[batch_name] = all(map(lambda s: not s in ['completed', 'running-open', 'running-closed', 'acquired',
                                                               'staged', 'staging', 'assigned', 'assignment-approved'],
                                           [r['RequestStatus'] for r in in_batches]))
        ## already verified
        has_batch_go = batch_go[batch_name]
        if not has_batch_go:
            wfi.sendLog('closor',
                        'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all closed' % (batch_name, batch_name))
            return

    if wfi.request['RequestStatus'] in ['announced', 'normal-archived'] and not options.force:
        ## manually announced ??
        self.to_status = 'done'
        self.to_wm_status = wfi.request['RequestStatus']
        wfi.sendLog('closor', '%s is announced already : %s' % (wfo.name, self.to_wm_status))
        return

    if jump_the_line:
        wfi.sendLog('closor', 'Announcing while completing')

    expected_lumis = 1
    if not 'TotalInputLumis' in wfi.request:
        print wfo.name, "has not been assigned yet, or the database is corrupted"
    elif wfi.request['TotalInputLumis'] == 0:
        print wfo.name, "is corrupted with 0 expected lumis"
    else:
        expected_lumis = wfi.request['TotalInputLumis']

    ## what are the outputs
    outputs = wfi.request['OutputDatasets']
    ## check whether the number of lumis is as expected for each
    all_OK = defaultdict(lambda: False)
    stats = defaultdict(int)
    if len(outputs):
        print wfo.name, wfi.request['RequestStatus']
    for out in outputs:
        event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
        self.outs.append(Output(datasetname=out))
        odb = self.outs[-1]
        odb.workflow = wfo
        odb.nlumis = lumi_count
        odb.nevents = event_count
        odb.workfow_id = wfo.id  ## spelling follows the Output model's column name
        if odb.expectedlumis < expected_lumis:
            odb.expectedlumis = expected_lumis
        else:
            expected_lumis = odb.expectedlumis
        odb.date = time.mktime(time.gmtime())

        fraction = lumi_count / float(expected_lumis) * 100.
        completion_line = "%60s %d/%d = %3.2f%%" % (out, lumi_count, expected_lumis, fraction)
        wfi.sendLog('closor', "\t%s" % completion_line)
        if wfi.isRelval() and fraction < batch_goodness:
            self.batch_warnings[wfi.getCampaign()].add(completion_line)
            if fraction < 50:
                self.batch_extreme_warnings[wfi.getCampaign()].add(completion_line)
        stats[out] = lumi_count
        all_OK[out] = True

    ## check for at least one full copy prior to moving on
    #in_full = {}
    for out in outputs:
        all_OK[out] = True
        ## the full-copy check below is disabled for now
        #in_full[out] = []
        #presence = getDatasetPresence(url, out)
        #where = [site for site, info in presence.items() if info[0]]
        #if where:
        #    all_OK[out] = True
        #    print out, "is in full at", ",".join(where)
        #    in_full[out] = copy.deepcopy(where)
        #else:
        #    going_to = wfi.request['NonCustodialSites'] + wfi.request['CustodialSites']
        #    wfi.sendLog('closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to))))
        #    at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to])
        #    else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to])
        #    print json.dumps(at_destination)
        #    print json.dumps(else_where, indent=2)
        #    ## do the full stuck-transfer study, missing files, etc.
        #    for there in going_to:
        #        late_info = findLateFiles(url, out, going_to=there)
        #        for l in late_info:
        #            l.update({"workflow": wfo.name, "dataset": out})
        #        self.all_late_files.extend(late_info)
        #        if check_fullcopy_to_announce:
        #            ## only set this false if the check is relevant
        #            all_OK[out] = False

    ## verify if we have to do harvesting
    #if not options.no_harvest and not jump_the_line:
    #    #(OK, requests) = spawn_harvesting(url, wfi, in_full)
    #    sites_for_DQMHarvest = UC.get("sites_for_DQMHarvest")
    #    (OK, requests) = spawn_harvesting(url, wfi, sites_for_DQMHarvest)
    #    print "Harvesting workflow has been created and assigned to: "
    #    print sites_for_DQMHarvest
    #    all_OK.update(OK)

    ## only that status can let me go into announced
    if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line):
        print wfo.name, "to be announced"
        results = []
        if not results:
            for out in outputs:
                print "dealing with", out
                if out in stats and not stats[out]:
                    continue
                _, dsn, process_string, tier = out.split('/')
                if all_OK[out]:
                    print "setting valid"
                    results.append(setDatasetStatus(out, 'VALID', withFiles=False))
                if all_OK[out] and wfi.isRelval():
                    ## make the specific relval rules and the replicas
                    ## figure the destination(s) out
                    destinations = set()
                    if tier in UC.get("tiers_to_rucio_relval"):
                        wfi.sendLog('closor', "Data Tier: %s is blacklisted, so skipping dataset placement for: %s" % (tier, out))
                        continue
                    if tier != "RECO" and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if tier == "GEN-SIM":
                        destinations.add('T1_US_FNAL_Disk')
                    if tier == "GEN-SIM-DIGI-RAW":
                        destinations.add('T1_US_FNAL_Disk')
                    if tier == "GEN-SIM-RECO":
                        destinations.add('T1_US_FNAL_Disk')
                    if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if destinations:
                        wfi.sendLog('closor', '%s to go to %s' % (out, ', '.join(sorted(destinations))))
                elif all_OK[out]:
                    campaign = None
                    try:
                        campaign = out.split('/')[2].split('-')[0]
                    except:
                        if 'Campaign' in wfi.request and wfi.request['Campaign']:
                            campaign = wfi.request['Campaign']
                    to_DDM = False
                    ## campaign override
                    if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                        to_DDM = True
                    ## by typical enabling
                    if tier in UC.get("tiers_to_rucio_nonrelval"):
                        wfi.sendLog('closor', "Data Tier: %s is blacklisted, so skipping dataset placement for: %s" % (tier, out))
                        continue
                    if tier in UC.get("tiers_to_DDM"):
                        to_DDM = True
                    ## check for unitarity
                    if not tier in UC.get("tiers_no_DDM") + UC.get("tiers_to_DDM"):
                        print "tier", tier, "neither TO nor NO DDM for", out
                        results.append('Unrecognized tier %s' % tier)
                        #sendEmail("failed DDM injection", "could not recognize %s for injecting in DDM" % out)
                        sendLog('closor', "could not recognize %s for injecting in DDM" % out, level='critical')
                        continue
                    n_copies = 1
                    destinations = []
                    if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                        ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                        if type(ddm_instructions) == int:
                            n_copies = CI.campaigns[campaign]['DDMcopies']
                        elif type(ddm_instructions) == dict:
                            ## a more fancy configuration
                            for ddmtier, indication in ddm_instructions.items():
                                if ddmtier == tier or ddmtier in ['*', 'all']:
                                    ## this entry applies here
                                    if 'N' in indication:
                                        n_copies = indication['N']
                                    if 'host' in indication:
                                        destinations = indication['host']
                    destination_spec = ""
                    if destinations:
                        destination_spec = "--destination=" + ",".join(destinations)
                    group_spec = ""  ## not used yet
                else:
                    print wfo.name, "no stats for announcing", out
                    results.append('No Stats')

            ## check the ParentageResolved flag from ReqMgr
            if wfi.request['RequestType'] == 'StepChain' and check_parentage_to_announce:
                if wfi.request['ParentageResolved']:
                    results.append(True)
                else:
                    wfi.sendLog('closor', "Delayed announcement of %s due to unresolved parentage dependencies" % wfi.request['RequestName'])
                    results.append('No ParentageResolved')

            if all(map(lambda result: result in ['None', None, True], results)):
                if not jump_the_line:
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None', None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        self.to_wm_status = wl_bis.request['RequestStatus']
                        if wl_bis.request['RequestStatus'] in ['announced', 'normal-archived']:
                            res = None
                        else:
                            res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    results.append(res)

        print results
        if all(map(lambda result: result in ['None', None, True], results)):
            if jump_the_line:
                if not 'announced' in wfo.status:
                    self.to_status = wfo.status.replace('announce', 'announced')
            else:
                self.to_status = 'done'
                self.closing = True
            wfi.sendLog('closor', "workflow outputs are announced")
        else:
            wfi.sendLog('closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results)))
    elif wfi.request['RequestStatus'] in ['failed', 'aborted', 'aborted-archived',
                                          'rejected', 'rejected-archived', 'aborted-completed']:
        if wfi.isRelval():
            self.to_status = 'forget'
            self.to_wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, self.to_wm_status))
        else:
            self.to_status = 'trouble'
            self.to_wm_status = wfi.request['RequestStatus']
    else:
        print wfo.name, "not good for announcing:", wfi.request['RequestStatus']
        wfi.sendLog('closor', "cannot be announced")
        self.held.add(wfo.name)
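
## The relval placement branch above reads more easily as a pure function.
## This sketch (illustration only, not the production helper) mirrors the
## rules exactly as written: dsn, process_string and tier come from
## out.split('/'), and the tiers_to_rucio_relval blacklist is assumed to have
## been checked beforehand.
def _relval_destinations(dsn, process_string, tier):
    destinations = set()
    if tier != "RECO" and tier != "ALCARECO":
        destinations.add('T2_CH_CERN')
    if tier in ("GEN-SIM", "GEN-SIM-DIGI-RAW", "GEN-SIM-RECO"):
        destinations.add('T1_US_FNAL_Disk')
    if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
        destinations.add('T2_CH_CERN')
    if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
        destinations.add('T2_CH_CERN')
    return destinations

print sorted(_relval_destinations('RelValTTBar_13', 'TkAlMinBias-RECO-v1', 'GEN-SIM-RECO'))
## -> ['T1_US_FNAL_Disk', 'T2_CH_CERN']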
def rejector(url, specific, options=None):
    up = componentInfo(soft=['wtc'])
    if not up.check():
        return

    if specific and specific.startswith('/'):
        ## this is for a dataset
        print setDatasetStatus(specific, 'INVALID')
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend(session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
        if not wfs:
            batches = json.loads(open('batches.json').read())
            for bname in batches:
                if specific == bname:
                    for wf in batches[bname]:
                        wfs.append(session.query(Workflow).filter(Workflow.name == wf).first())
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'assistance-clone').all()
        #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all())
        ## be careful then: on clone, treat case by case
        options.clone = True
        print "not supposed to function yet"
        return

    print len(wfs), "to reject"
    if len(wfs) > 1:
        print "\n".join([wfo.name for wfo in wfs])
        answer = raw_input('Reject these')
        if not answer.lower() in ['y', 'yes']:
            return

    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", specific
            return
        results = []
        wfi = workflowInfo(url, wfo.name)
        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(url, wfo.name,
                                        current_status=wfi.request['RequestStatus'])
        comment = ""
        if options.comments:
            comment = ", reason: " + options.comments
        wfi.sendLog('rejector', 'invalidating the workflow by unified operator%s' % comment)

        ## find the whole resubmission family and reject it as well
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        for fwl in familly:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType'] != 'Resubmission': continue
            ## does not work on second-order acdc
            #if 'OriginalRequestName' in fwl and fwl['OriginalRequestName'] != wfi.request['RequestName']: continue
            print "rejecting", fwl['RequestName']
            reqMgrClient.invalidateWorkflow(url, fwl['RequestName'],
                                            current_status=fwl['RequestStatus'],
                                            cascade=False)
            datasets.update(fwl['OutputDatasets'])

        for dataset in datasets:
            if options.keep:
                print "keeping", dataset, "in its current status"
            else:
                results.append(setDatasetStatus(dataset, 'INVALID'))

        if all(map(lambda result: result in ['None', None, True], results)):
            print wfo.name, "and", datasets, "are rejected"
            if options and options.clone:
                wfo.status = 'trouble'
                session.commit()
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    ## the value may come back from ReqMgr as a string, hence the int() conversion
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion']) + 1
                else:
                    schema['ProcessingVersion'] = 2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tamperings with the original request
                if options.Memory:
                    if schema['RequestType'] == 'TaskChain':
                        it = 1
                        while True:
                            t = 'Task%d' % it
                            it += 1
                            if t in schema:
                                schema[t]['Memory'] = options.Memory
                            else:
                                break
                    else:
                        schema['Memory'] = options.Memory

                if options.Multicore:
                    ## to do : set it properly in taskchains
                    if schema['RequestType'] == 'TaskChain':
                        tasks, set_to = options.Multicore.split(':') if ':' in options.Multicore else ("", options.Multicore)
                        set_to = int(set_to)
                        tasks = tasks.split(',') if tasks else ['Task1']
                        it = 1
                        while True:
                            tt = 'Task%d' % it
                            it += 1
                            if tt in schema:
                                tname = schema[tt]['TaskName']
                                if tname in tasks or tt in tasks:
                                    mem = schema[tt]['Memory']
                                    mcore = schema[tt].get('Multicore', 1)
                                    factor = (set_to / float(mcore))
                                    fraction_constant = 0.4
                                    mem_per_core_c = int((1 - fraction_constant) * mem / float(mcore))
                                    print "mem per core", mem_per_core_c
                                    print "base mem", mem
                                    ## adjust the parameters in the clone
                                    schema[tt]['Memory'] += (set_to - mcore) * mem_per_core_c
                                    schema[tt]['Multicore'] = set_to
                                    schema[tt]['TimePerEvent'] /= factor
                            else:
                                break
                    else:
                        schema['Multicore'] = options.Multicore

                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent
                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int, options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID
                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1, ntask - 1):
                        schema['Task%d' % it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask - 1
                    schema.pop('Task%d' % ntask)

                ## update to the current priority, unless an explicit override is given
                schema['RequestPriority'] = wfi.request['RequestPriority']
                if options.priority:
                    schema['RequestPriority'] = options.priority

                ## drop fields that cannot be fed back to reqmgr2
                schema = reqMgrClient.purgeClonedSchema(schema)

                print "submitting"
                if (options.to_stepchain and (schema['RequestType'] == 'TaskChain')):
                    ## transform the schema into a StepChain schema
                    print "Transforming a TaskChain into a StepChain"
                    schema['RequestType'] = 'StepChain'
                    schema['StepChain'] = schema.pop('TaskChain')
                    schema['SizePerEvent'] = 0
                    schema['TimePerEvent'] = 0
                    step = 1
                    s_n = {}
                    while True:
                        if 'Task%d' % step in schema:
                            schema['Step%d' % step] = schema.pop('Task%d' % step)
                            schema['Step%d' % step]['StepName'] = schema['Step%d' % step].pop('TaskName')
                            s_n[schema['Step%d' % step]['StepName']] = 'Step%d' % step
                            if 'InputTask' in schema['Step%d' % step]:
                                schema['Step%d' % step]['InputStep'] = schema['Step%d' % step].pop('InputTask')
                            eff = 1.
                            up_s = 'Step%d' % step
                            while True:
                                ## climb up a step; supposedly all converted already
                                up_s = s_n.get(schema[up_s].get('InputStep', None), None)
                                if up_s:
                                    ## multiply by the efficiency
                                    eff *= schema[up_s].get('FilterEfficiency', 1.)
                                else:
                                    ## or stop there
                                    break
                            if not 'KeepOutput' in schema['Step%d' % step]:
                                ## this is a weird translation capability: in a step, absence of
                                ## KeepOutput means keep the output, while in a TaskChain absence means drop it
                                schema['Step%d' % step]['KeepOutput'] = False
                            schema['TimePerEvent'] += eff * schema['Step%d' % step].pop('TimePerEvent')
                            schema['SizePerEvent'] += eff * schema['Step%d' % step].pop('SizePerEvent')
                            step += 1
                        else:
                            break

                print json.dumps(schema, indent=2)
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning", wfo.name
                    print json.dumps(schema, indent=2)
                    return
                print newWorkflow

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfi.sendLog('rejector', 'Cloned into %s by unified operator %s' % (newWorkflow, comment))
                wfi.notifyRequestor('Cloned into %s by unified operator %s' % (newWorkflow, comment), do_batch=False)
            else:
                wfo.status = 'trouble' if options.set_trouble else 'forget'
                wfi.notifyRequestor('Rejected by unified operator %s' % (comment), do_batch=False)
                session.commit()
        else:
            print "error in rejecting", wfo.name, results
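
## The TaskChain -> StepChain translation above weights each step's
## TimePerEvent and SizePerEvent by the product of the FilterEfficiency of
## all upstream steps. A minimal sketch of just that aggregation, on an
## invented two-step chain (numbers chosen for easy checking):
toy_steps = {
    'Step1': {'StepName': 'gen', 'FilterEfficiency': 0.1,
              'TimePerEvent': 10., 'SizePerEvent': 100.},
    'Step2': {'StepName': 'reco', 'InputStep': 'gen',
              'TimePerEvent': 50., 'SizePerEvent': 500.},
}
s_n = dict((v['StepName'], k) for k, v in toy_steps.items())
total_tpe = total_spe = 0.
for step in sorted(toy_steps):
    eff, up_s = 1., step
    while True:
        ## climb to the parent step and pick up its filter efficiency
        up_s = s_n.get(toy_steps[up_s].get('InputStep', None), None)
        if not up_s:
            break
        eff *= toy_steps[up_s].get('FilterEfficiency', 1.)
    total_tpe += eff * toy_steps[step]['TimePerEvent']
    total_spe += eff * toy_steps[step]['SizePerEvent']
print total_tpe, total_spe  ## 15.0 150.0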