def cachor(spec=None): mlock = moduleLock(silent=True) if mlock(): print "currently running" return TS = transferStatuses() print sorted(TS.all()), "cached transfers" ## pop all that are now in inactive for phedexid in TS.all(): transfers = session.query(TransferImp).filter( TransferImp.phedexid == int(phedexid)).filter( TransferImp.active == True).all() if not transfers: print phedexid, "does not look relevant to be in cache anymore. poping" TS.pop(phedexid) all_transfers = set() for imp in session.query(TransferImp).filter( TransferImp.active == True).all(): all_transfers.add(imp.phedexid) all_transfers = list(all_transfers) random.shuffle(all_transfers) existing = map(int, TS.all()) new = (set(all_transfers) - set(existing)) print len(new), "transfers not look out at all, will do those first", new if spec: new = [int(spec)] new = list(new) random.shuffle(new) #for transfer in all_transfers: for phedexid in all_transfers: #print phedexid if new and phedexid != new[0]: continue print "running the check on", phedexid new_check = checkTransferStatus(url, phedexid, nocollapse=True) if new_check: print json.dumps(new_check, indent=2) TS.add(phedexid, new_check) else: print "withouth an update, we are in some trouble." sendLog('cachor', 'Failed transfer status check on %s' % phedexid, level='critical') #do only one break
def rulor(spec=None, options=None): mlock = moduleLock() if mlock(): return up = componentInfo(soft=['mcm', 'wtc']) if not up.check(): return if spec: wfs = session.query(Workflow).filter( Workflow.status.contains('manual')).filter( Workflow.name.contains(spec)).all() else: wfs = session.query(Workflow).filter( Workflow.status.contains('manual')).all() COI = closeoutInfo() RI = reportInfo() WC = wtcClient() JC = JIRAClient() ## a list of function with a given trace ( wfi, record, report) => (action dict list) rules = [ majority_of_139_nanoaod, majority_of_71104, ] for wfo in wfs: wfi = workflowInfo(reqmgr_url, wfo.name) record = COI.get(wfo.name) report = RI.get(wfo.name) if not record: print "no information to look at" continue print "close out information as in the assistance page" print json.dumps(record, indent=2) print "report information as in the unified report" print json.dumps(report, indent=2) ## parse the information and produce an action document ### a rule for on-going issue with memory in campaign ... acted = False for condition in rules: acts = condition(wfi, record, report) if acts: print "list of actions being taken for", wfo.name for a in acts: print json.dumps(a, indent=2) if not options.test: acted = True WC.set_actions(acts) wfo.status = wfo.status.replace('manual', 'acting') session.commit() break if acted: continue if "some conditions": action_doc = { 'workflow': wfo.name, 'name': "a task name", 'parameters': { 'action': 'acdc', 'memory': 5000 } } acted = True if acted: continue if "majority of 139": pass if acted: continue
if sub_lap: print "[lockor] Sub Lap : %s [s]"% ( now - time_point.sub_lap ) time_point.sub_lap = now else: print "[lockor] Lap : %s [s]"% ( now - time_point.lap ) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime()) time_point("Starting initialization") url = reqmgr_url mlock = moduleLock() if mlock(): sys.exit(0) use_mcm=True up = componentInfo(soft=['mcm','wtc','jira']) if not up.check(): sys.exit(0) use_mcm = up.status['mcm'] mcm=None if use_mcm: print "mcm interface is up" mcm = McMClient(dev=False) statuses = ['assignment-approved','assigned','failed','acquired','running-open','running-closed','force-complete','completed','closed-out'] UC = unifiedConfiguration()
def stuckor(url = reqmgr_url): mlock = moduleLock() if mlock(): return TD = transferDataset() datasets_by_phid = TD.content() really_stuck_dataset = set(json.loads(eosRead('%s/really_stuck_dataset.json'%base_eos_dir))) UC = unifiedConfiguration() print "make a report of stuck transfers" bad_destinations = defaultdict(set) bad_sources = defaultdict(set) report = "" transfer_timeout = UC.get("transfer_timeout") transfer_lowrate = UC.get("transfer_lowrate") for phid,datasets in datasets_by_phid.items(): issues = checkTransferLag( url, phid , datasets=list(datasets) ) for dataset in issues: for block in issues[dataset]: for destination in issues[dataset][block]: (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination] ## count x_Buffer and x_MSS as one source redones=[] for d in dones: if d.endswith('Buffer') or d.endswith('Export'): if d.replace('Buffer','MSS').replace('Export','MSS') in dones: continue else: redones.append( d ) else: redones.append( d ) dones = list(set( redones )) #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones) if delay>transfer_timeout and rate<transfer_lowrate: if len(dones)>1: ## its the destination that sucks bad_destinations[destination].add( block ) else: dum=[bad_sources[d].add( block ) for d in dones] really_stuck_dataset.add( dataset ) print "add",dataset,"to really stuck" report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay) print "\n"*2 ## create tickets right away ? 
report+="\nbad sources "+",".join(bad_sources.keys())+"\n" for site,blocks in bad_sources.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n" for site,blocks in bad_destinations.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) print '\n'*2,"Datasets really stuck" print '\n'.join( really_stuck_dataset ) print '\n'*2,"report written at %s/logs/incomplete_transfers.log"%unified_url print report missing_in_action = json.loads(eosRead('%s/incomplete_transfers.json'%monitor_dir)) stuck_transfers = dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset]) print '\n'*2,'Stuck dataset transfers' print json.dumps(stuck_transfers , indent=2) eosFile('%s/stuck_transfers.json'%monitor_pub_dir,'w').write( json.dumps(stuck_transfers , indent=2) ).close() eosFile('%s/logs/incomplete_transfers.log'%monitor_dir,'w').write( report ).close()
def injector(url, options, specific):
    """Register new workflows in the local database and wire replacements.

    Two passes are performed:

    1. Workflows returned by ``getWorkflows`` (for ``options.user`` plus the
       configured/overridden ReReco, TaskChain and StoreResults users) that
       are not yet in the database are added with ``options.setstatus``,
       unless a related workflow is still in a live status, one of the
       inputs is not 'VALID', or an input has no file (the latter is
       waived for StoreResults requests).
    2. Workflows in status 'trouble' are matched to a replacement sharing
       the same PrepID; the old workflow is set to 'forget' and its
       transfers are re-attached to the replacement.

    Parameters:
        url: request manager url passed to the helper calls.
        options: object providing wmstatus, user, user_relval,
            user_storeresults, setstatus, invalidate and replace.
        specific: optional substring; when set only workflows whose name
            contains it are handled.
    """
    mlock = moduleLock()
    if mlock(): return
    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    UC = unifiedConfiguration()
    # read from the configuration but not used below (the keyword-based
    # conversion is commented out further down)
    transform_keywords = UC.get('convert_to_stepchain')
    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for user in UC.get("user_rereco"):
        workflows.extend(
            getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco"))
    # command-line user lists, when given, take precedence over the configuration
    for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")):
        workflows.extend(
            getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain"))
    for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")):
        workflows.extend(
            getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults"))
    print len(workflows), "in line"
    cannot_inject = set()
    # never filled in this version: the code adding to it is commented out
    to_convert = set()
    # dataset name -> status, so that each dataset is looked up only once
    status_cache = defaultdict(str)
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            familly = session.query(Workflow).filter(
                Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                # second try: look the relatives up by prepid and keep only
                # those with exactly the same set of prepids
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))
                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(
                            session.query(Workflow).filter(
                                Workflow.name == req_member['RequestName']).all())
            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    # a relative in any status other than these blocks the injection
                    if not lwfo.status in [
                            'forget', 'trouble', 'forget-unlock', 'forget-out-unlock'
                    ]:
                        wfi.sendLog(
                            'injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status))
                        sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status), level='critical')
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False
            ## add a check on validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog(
                        'injector', "One of the input is not VALID. %s : %s" % (d, status_cache[d]))
                    sendLog('injector', "One of the input of %s is not VALID. %s : %s" % (wf, d, status_cache[d]), level='critical')
                    can_add = False
                ## check for any file in phedex, to verify existence
                _, ph_files, _, _ = getDatasetFiles(url, d)
                # setdefault also inserts RequestType=None in the request when missing
                if not ph_files and not ('StoreResults' == wfi.request.setdefault(
                        'RequestType', None)):
                    wfi.sendLog(
                        'injector', "One of the input has no file in phedex: %s" % d)
                    sendLog('injector', "One of the input has no file in phedex: %s" % d, level='critical')
                    can_add = False
            ### ban some workflow that you don't like anymore
            #outputs = wfi.request['OutputDatasets']
            if not can_add: continue
            ## temporary hack to transform specific taskchain into stepchains
            #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords)
            # the result is not used while the conversion below stays commented out
            good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=None)
            ## match keywords and technical constraints
            #if (not options.no_convert) and good_for_stepchain and not wfi.isRelval():
            #    to_convert.add( wf )
            #    wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf)
            #    #sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf)
            wfi.sendLog('injector', "considering %s" % wf)
            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass
    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog(
            'injector', 'These workflow cannot be added in because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning')
    # NOTE(review): the workflow name is interpolated into a shell command line;
    # harmless while to_convert stays empty, to be revisited if it gets filled again
    for wf in to_convert:
        os.system(
            './Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s' % wf)
    ## passing a round of invalidation of what needs to be invalidated
    # "or True" makes the condition depend on use_mcm only
    if use_mcm and (options.invalidate or True):
        invalidator(url)
    no_replacement = set()
    #print "getting all transfers"
    #all_transfers=session.query(Transfer).all()
    #print "go!"
    ## pick up replacements
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                # an explicit replacement bypasses the date/type/status filters
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in [
                        'rejected', 'rejected-archived', 'aborted', 'aborted-archived'
                ]:
                    continue
            true_familly.append(fwl)
        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            if wfi.isRelval():
                #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.')
                wfi.sendLog(
                    'injector', 'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget'
                )
                wf.status = 'forget'
                session.commit()
            else:
                wfi.sendLog(
                    'injector', 'the workflow was found in trouble with no replacement')
                no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog(
                'injector', 'the workflow was found in trouble and has a replacement')
        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"
        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')
        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(
                Workflow.name == member).first()
            if not new_wf:
                sendLog('injector', "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog(
                    'injector', "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'
            for tr in session.query(TransferImp).filter(
                    TransferImp.workflow_id == wf.id).all():
                ## get all transfer working for the old workflow
                existing = session.query(TransferImp).filter(
                    TransferImp.phedexid == tr.phedexid).filter(
                        TransferImp.workflow_id == new_wf.id).all()
                tr.active = False  ## disable the old one
                if not existing:
                    ## create the transfer object for the new dependency
                    tri = TransferImp(phedexid=tr.phedexid, workflow=new_wf)
                    session.add(tri)
                session.commit()
        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector', 'workflow with no replacement\n%s \n are dangling there' % ('\n'.join(no_replacement)), level='critical')
def assignor(url, specific=None, talk=True, options=None): if userLock() and not options.manual: return mlock = moduleLock() if mlock() and not options.manual: return if not componentInfo().check() and not options.manual: return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() SI = global_SI() ###NLI = newLockInfo() ###if not NLI.free() and not options.go: return LI = lockInfo() #if not LI.free() and not options.go and not options.manual: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? 
#if options.partial and not specific: # pass aaa_mapping = json.loads(eosRead('%s/equalizor.json' % monitor_pub_dir))['mapping'] all_stuck = set() all_stuck.update( json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') # Temporarily switch off prioritization random.shuffle(wfos) ##order by priority instead of random """ if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank( wfn ): return cache.index( wfn ) if wfn in cache else 0 wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True) print "10 first",[wfo.name for wfo in wfos[:10]] print "10 last",[wfo.name for wfo in wfos[-10:]] else: random.shuffle( wfos ) """ for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue if not options.manual and 'rucio' in (wfo.name).lower(): continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" wfh.sendLog('assignor', "%s to be assigned %s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed, sites_not_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not 
output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('assignor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('assignor', critical_msg, level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed 
assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) blocks = wfh.getBlocks() if blocks: wfh.sendLog( 'assignor', "Needs {} blocks in input {}".format(len(blocks), '\n'.join(blocks))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters and primary: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] wfh.sendLog( 'assignor', "Initial values for primary_AAA=%s and secondary_AAA=%s" % (primary_aaa, secondary_aaa)) if primary_aaa: if "T2_CH_CERN_HLT" in sites_allowed: sites_allowed.remove("T2_CH_CERN_HLT") if "T2_CH_CERN_HLT" not in sites_not_allowed: sites_not_allowed.append("T2_CH_CERN_HLT") ## keep track of this, after secondary input location restriction : that's how you want to operate it initial_sites_allowed = copy.deepcopy(sites_allowed) set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) ## if they are requested for processing, they should bbe all 
closed already # FIXME: remove this closeAllBlocks #closeAllBlocks(url, prim, blocks) ## should be 2 but for the time-being let's lower it to get things going _copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) # TODO Alan on 1/april/2020: keep the AAA functionality if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_allowed: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_allowed) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if isStoreResults: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## 
should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1t2_only = [ ce for ce in sites_allowed if [ce.startswith('T1') or ce.startswith('T2')] ] if t1t2_only: # try to pick from T1T2 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])] # then pick any otherwise else: sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] print "available=", SI.disk[sites_out[0]] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'SiteBlacklist': sites_not_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: # Do not set TrustPUSitelist to True if there is no secondary if secondary: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## 
plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in 
pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] # FIXME: decide which of the lines below needs to remain... eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." 
) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) if wfh.producePremix() and (not wfh.isRelval()): title = "Heavy workflow assigned to {}".format( parameters['SiteWhitelist']) body = "Workflow name: {}".format( wfh.request['RequestName']) body += "\nOutput dataset(s): {}".format( wfh.request['OutputDatasets']) body += "\nAssigned to: {}".format( parameters['SiteWhitelist']) sendEmail( title, body, destination=[ '*****@*****.**' ]) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def injector(url, options, specific): mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm','wtc','jira'] ) if not up.check(): return use_mcm = up.status['mcm'] UC = unifiedConfiguration() transform_keywords = UC.get('convert_to_stepchain') workflows = getWorkflows(url, status=options.wmstatus, user=options.user) for user in UC.get("user_rereco"): workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco")) for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")) : workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain")) for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")) : workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults")) print len(workflows),"in line" cannot_inject = set() to_convert = set() status_cache = defaultdict(str) ## browse for assignment-approved requests, browsed for ours, insert the diff for wf in workflows: if specific and not specific in wf: continue exists = session.query(Workflow).filter(Workflow.name == wf ).first() if not exists: wfi = workflowInfo(url, wf) ## check first that there isn't related here with something valid can_add = True ## first try at finding a match familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all() if not familly: pids = wfi.getPrepIDs() req_familly = [] for pid in pids: req_familly.extend( getWorkflowById( url, pid, details=True) ) familly = [] print len(req_familly),"members" for req_member in req_familly: #print "member",req_member['RequestName'] owfi = workflowInfo(url, req_member['RequestName'], request=req_member) other_pids = owfi.getPrepIDs() if set(pids) == set(other_pids): ## this is a real match familly.extend( session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all() ) for lwfo in familly: if lwfo: ## we have it 
already if not lwfo.status in ['forget','trouble','forget-unlock','forget-out-unlock']: wfi.sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status )) sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status ), level='critical') print "Should not put",wf,"because of",lwfo.name,lwfo.status cannot_inject.add( wf ) can_add = False ## add a check on validity of input datasets _,prim,par,sec = wfi.getIO() for d in list(prim)+list(par)+list(sec): if not d in status_cache: status_cache[d] = getDatasetStatus(d) if status_cache[d] != 'VALID': wfi.sendLog('injector',"One of the input is not VALID. %s : %s"%( d, status_cache[d])) sendLog('injector',"One of the input of %s is not VALID. %s : %s"%( wf, d, status_cache[d]), level='critical') can_add = False #else: # ##make sure that all blocks get closed # closeAllBlocks(url, d) ## check for any file in phedex, to verify existence _,ph_files,_,_ = getDatasetFiles(url, d) if not ph_files and not ( 'StoreResults' == wfi.request.setdefault('RequestType',None) ): wfi.sendLog('injector',"One of the input has no file in phedex: %s" % d ) sendLog('injector',"One of the input has no file in phedex: %s"% d, level='critical') can_add = False ### ban some workflow that you don't like anymore #outputs = wfi.request['OutputDatasets'] if not can_add: continue ## temporary hack to transform specific taskchain into stepchains good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords) #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = None) ## match keywords and technical constraints if (not options.no_convert) and good_for_stepchain and not wfi.isRelval(): to_convert.add( wf ) wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf) sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf) wfi.sendLog('injector',"considering %s"%wf) new_wf = Workflow( name = wf , status = options.setstatus, wm_status = 
options.wmstatus) session.add( new_wf ) session.commit() time.sleep(0.5) else: #print "already have",wf pass if cannot_inject: #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject))) sendLog('injector','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)), level='critical') for wf in to_convert: os.system('./Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s'% wf) ## passing a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) no_replacement = set() #print "getting all transfers" #all_transfers=session.query(Transfer).all() #print "go!" ## pick up replacements for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all(): print wf.name if specific and not specific in wf.name: continue print wf.name wfi = workflowInfo(url, wf.name ) wl = wfi.request #getWorkLoad(url, wf.name) familly = getWorkflowById( url, wl['PrepID'] ) true_familly = [] for member in familly: if member == wf.name: continue fwl = getWorkLoad(url , member) if options.replace: if member != options.replace: continue else: if fwl['RequestDate'] < wl['RequestDate']: continue if fwl['RequestType']=='Resubmission': continue if fwl['RequestStatus'] in ['None',None,'new']: continue if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue true_familly.append( fwl ) if len(true_familly)==0: #sendLog('injector','%s had no replacement'%wf.name, level='critical') if wfi.isRelval(): #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.') wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. 
Setting forget') wf.status = 'forget' session.commit() else: wfi.sendLog('injector','the workflow was found in trouble with no replacement') no_replacement.add( wf.name ) continue else: wfi.sendLog('injector','the workflow was found in trouble and has a replacement') print wf.name,"has",len(familly),"familly members" print wf.name,"has",len(true_familly),"true familly members" ##we cannot have more than one of them !!! pick the last one if len(true_familly)>1: #sendEmail('multiple wf','please take a look at injector for %s'%wf.name) sendLog('injector','Multiple wf in line, will take the last one for %s \n%s'%( wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical') for fwl in true_familly[-1:]: member = fwl['RequestName'] new_wf = session.query(Workflow).filter(Workflow.name == member).first() if not new_wf: sendLog('injector',"putting %s as replacement of %s"%( member, wf.name)) status = 'away' if fwl['RequestStatus'] in ['assignment-approved']: status = 'considered' new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus']) wf.status = 'forget' session.add( new_wf ) else: if new_wf.status == 'forget': continue sendLog('injector',"getting %s as replacement of %s"%( new_wf.name, wf.name )) wf.status = 'forget' for tr in session.query(TransferImp).filter( TransferImp.workflow_id == wf.id).all(): ## get all transfer working for the old workflow existing = session.query(TransferImp).filter( TransferImp.phedexid == tr.phedexid).filter( TransferImp.workflow_id == new_wf.id).all() tr.active = False ## disable the old one if not existing: ## create the transfer object for the new dependency tri = TransferImp( phedexid = tr.phedexid, workflow = new_wf) session.add( tri ) session.commit() ## don't do that automatically #wf.status = 'forget' session.commit() if no_replacement: #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement))) sendLog('injector','workflow with no 
replacement\n%s \n are dangling there'% ( '\n'.join(no_replacement)), level='critical')
## Stand-alone script: runs the UnifiedLock deadlock step, then the module
## lock check for the 'deadlock' component.
import glob
import os
import socket

from utils import UnifiedLock, moduleLock

## the lock object is built with acquire=False before deadlock() is called
UL = UnifiedLock(acquire=False)
UL.deadlock()

## get rid of deadlock in mongodb
mlock = moduleLock(component='deadlock')
mlock.check()
def closor(url, specific=None, options=None): if userLock(): return mlock = moduleLock() if mlock(): return up = componentInfo(soft=['mcm','wtc']) if not up.check(): return UC = unifiedConfiguration() CI = campaignInfo() BI = batchInfo() CloseI = closeoutInfo() all_late_files = [] jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status=='close').all() if specific: wfs = [wfo for wfo in wfs if specific in wfo.name] wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs),"closing" random.shuffle( wfs ) max_per_round = UC.get('max_per_round').get('closor',None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key = lambda r : r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank( wfn ): return all_closedout.index( wfn ) if wfn in all_closedout else 0 wfs = sorted( wfs, key = lambda wfo : rank( wfo.name ),reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") closers = [] print len(wfs),"closing" th_start = time.mktime(time.gmtime()) for iwfo,wfo in enumerate(wfs): if specific and not specific in wfo.name: continue closers.append( CloseBuster( wfo = wfo, url = url, CI = CI, UC = UC, jump_the_line = jump_the_line, batch_goodness = batch_goodness, batch_go = batch_go, #stats = stats, batch_warnings = batch_warnings, all_late_files = all_late_files, held = held, )) run_threads = ThreadHandler( threads = closers, n_threads = 
options.threads, sleepy = 10, timeout = None, verbose = True, label = 'closor') run_threads.start() ## waiting on all to complete while run_threads.is_alive(): #print "Waiting on closing threads",time.asctime(time.gmtime()) time.sleep(5) JC = JIRAClient() if up.status.get('jira',False) else None print len(run_threads.threads),"finished thread to gather information from" failed_threads = 0 for to in run_threads.threads: if to.failed: failed_threads += 1 continue if to.outs: for outO in to.outs: out = outO.datasetname odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out session.add( outO ) else: odb.date = outO.date if to.to_status: to.wfo.status = to.to_status if JC and to.to_status == "done" and to.wfi: jiras = JC.find({"prepid" : to.wfi.request['PrepID']}) for jira in jiras: JC.close(jira.key) if to.to_wm_status: to.wfo.wm_status = to.to_wm_status if to.closing: CloseI.pop( to.wfo.name ) session.commit() th_stop = time.mktime(time.gmtime()) if wfs: time_spend_per_workflow = (th_stop-th_start) / float(len(wfs)) print "Average time spend per workflow is", time_spend_per_workflow if float(failed_threads/run_threads.n_threads) > 0: sendLog('checkor','%d/%d threads have failed, better check this out'% (failed_threads, run_threads.n_threads), level='critical') sendEmail('checkor','%d/%d threads have failed, better check this out'% (failed_threads,run_threads.n_threads)) days_late = 0. 
retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical') for bname,go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s"% bname issues="" if batch_warnings[ bname ]: issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness issues+="\n".join( sorted( batch_warnings[ bname ] )) issues+="\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """%( bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to ) ## just announced ; take it out now. BI.pop( bname ) if os.path.isfile('.closor_stop'): print "The loop on workflows was shortened" sendEmail('closor','Closor loop was shortened artificially using .closor_stop') os.system('rm -f .closor_stop')
print "[lockor] Sub Lap : %s [s]" % (now - time_point.sub_lap) time_point.sub_lap = now else: print "[lockor] Lap : %s [s]" % (now - time_point.lap) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime( time.gmtime()) time_point("Starting initialization") url = reqmgr_url mlock = moduleLock() if mlock(): sys.exit(0) use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): sys.exit(0) use_mcm = up.status['mcm'] mcm = None if use_mcm: print "mcm interface is up" mcm = McMClient(dev=False) statuses = [ 'assignment-approved', 'assigned', 'failed', 'staging', 'staged', 'acquired', 'running-open', 'running-closed', 'force-complete',
def stuckor(url=reqmgr_url): mlock = moduleLock() if mlock(): return TD = transferDataset() datasets_by_phid = TD.content() really_stuck_dataset = set( json.loads(eosRead('%s/really_stuck_dataset.json' % base_eos_dir))) UC = unifiedConfiguration() print "make a report of stuck transfers" bad_destinations = defaultdict(set) bad_sources = defaultdict(set) report = "" transfer_timeout = UC.get("transfer_timeout") transfer_lowrate = UC.get("transfer_lowrate") for phid, datasets in datasets_by_phid.items(): issues = checkTransferLag(url, phid, datasets=list(datasets)) for dataset in issues: for block in issues[dataset]: for destination in issues[dataset][block]: (block_size, destination_size, delay, rate, dones) = issues[dataset][block][destination] ## count x_Buffer and x_MSS as one source redones = [] for d in dones: if d.endswith('Buffer') or d.endswith('Export'): if d.replace('Buffer', 'MSS').replace('Export', 'MSS') in dones: continue else: redones.append(d) else: redones.append(d) dones = list(set(redones)) #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones) if delay > transfer_timeout and rate < transfer_lowrate: if len(dones) > 1: ## its the destination that sucks bad_destinations[destination].add(block) else: dum = [bad_sources[d].add(block) for d in dones] really_stuck_dataset.add(dataset) print "add", dataset, "to really stuck" report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n" % ( block, destination, ", ".join(dones), rate, delay) print "\n" * 2 ## create tickets right away ? 
report += "\nbad sources " + ",".join(bad_sources.keys()) + "\n" for site, blocks in bad_sources.items(): report += "\n\n%s:" % site + "\n\t".join([''] + list(blocks)) report += "\nbad destinations " + ",".join(bad_destinations.keys()) + "\n" for site, blocks in bad_destinations.items(): report += "\n\n%s:" % site + "\n\t".join([''] + list(blocks)) print '\n' * 2, "Datasets really stuck" print '\n'.join(really_stuck_dataset) print '\n' * 2, "report written at %s/logs/incomplete_transfers.log" % unified_url print report missing_in_action = json.loads( eosRead('%s/incomplete_transfers.json' % monitor_dir)) stuck_transfers = dict([(k, v) for (k, v) in missing_in_action.items() if k in really_stuck_dataset]) print '\n' * 2, 'Stuck dataset transfers' print json.dumps(stuck_transfers, indent=2) eosFile('%s/stuck_transfers.json' % monitor_pub_dir, 'w').write(json.dumps(stuck_transfers, indent=2)).close() eosFile('%s/logs/incomplete_transfers.log' % monitor_dir, 'w').write(report).close()
def closor(url, specific=None, options=None): if userLock(): return mlock = moduleLock() if mlock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() BI = batchInfo() CloseI = closeoutInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue print "Progress [%d/%d]" % (iwfo, len(wfs)) ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( 
map( lambda s: not s in [ 'completed', 'running-open', 'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name)) continue if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count / float(expected_lumis) * 100. 
completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[wfi.getCampaign()].add(completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! 
for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest( url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex'][ 'request_created'][0]['id'] results.append(True) except: results.append('Failed relval transfer') elif all_OK[out]: campaign = None try: campaign = 
out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending", out, " to DDM" status = pass_to_dynamo( [out], N=n_copies, sites=destinations if destinations else None, group=group_spec if group_spec else None) results.append(status) if status == True: wfi.sendLog( 'closor', '%s is send to dynamo in %s copies %s %s' % (out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor', "could not add " + out + " to dynamo pool. 
check closor logs.", level='critical') wfi.sendLog( 'closor', "could not add " + out + " to dynamo pool. check closor logs.") else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace( 'announce', 'announced') else: wfo.status = 'done' session.commit() CloseI.pop(wfo.name) wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) 
>= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" if batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """ % (bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to) ## just announced ; take it out now. BI.pop(bname)
for site in RDI.sites(): load = RDI.get(site) if si.disk[site] : continue print site,si.disk[site],"[TB] free",si.quota[site],"[TB] quota" if not load: continue tags = ['pilup','input','output','lock','unlock','tape','stuck-tape','missing-tape'] for tag in tags: v = sum([ info['size'] for ds,info in load.items() if tag in info['reasons']]) / 1024. print "\t %10f [TB] remaining because of %s"%(v,tag) if __name__ == "__main__": url = 'cmsweb.cern.ch' mlock = moduleLock(component='remainor',locking=False) ml=mlock() parser = optparse.OptionParser() parser.add_option('-s','--site', help="coma separated list of site to parse", default="") parser.add_option('-n','--nsites',help="number of site to parse", default=0, type=int) parser.add_option('-d','--ndatasets',help="number of top datasets to parse", default=0, type=int) parser.add_option('--subs-to-anaops', dest='change_dataops_subs_to_anaops_once_unlocked', default=False, action='store_true') parser.add_option('--invalidate', dest='invalidate_anything_left_production_once_unlocked', default=False, action='store_true') (options,args) = parser.parse_args() parse( options )
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos=[] fetch_from = [] if specific or options.early: fetch_from.extend(['considered','staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from",fetch_from for status in fetch_from: print "getting wf in",status wfos.extend(session.query(Workflow).filter(Workflow.status==status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_pub_dir).read() )) all_stuck.update( getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') ##order by priority instead of random if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank( wfn ): return cache.index( wfn ) if wfn in cache else 0 wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True) print "10 first",[wfo.name for wfo in wfos[:10]] print "10 last",[wfo.name for wfo in wfos[-10:]] else: random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and 
(n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) if wfh.request['RequestStatus'] in ['rejected','aborted','aborted-completed','aborted-archived','rejected-archived'] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled+=1 continue if options.priority and int(wfh.request['RequestPriority']) < options.priority: continue options_text="" if options.early: options_text+=", early option is ON" if options.partial: options_text+=", partial option is ON" options_text+=", good fraction is %.2f"%options.good_enough wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled+=1 wfh.sendLog('assignor','There is no output at all') sendLog('assignor','Workflow %s has no output at all'%( wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update( CI.campaigns[campaign] ) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]: banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: 
no_go=True wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier))) sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys())))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]: assign_parameters.update( allowed_secondary[sec] ) if no_go: n_stalled+=1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = wfh.getBlockWhiteList() rwl = wfh.getRunWhiteList() if rwl: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=rwl ) )) 
lwl = wfh.getLumiWhiteList() if lwl: ## augment with lumi white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, lumis=lwl))) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns: assign_parameters.update( CI.campaigns[wfh.request['Campaign']] ) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] if secondary_aaa: if not one_secondary_locations: sec_availability = getDatasetBlocksFraction( url, sec ) if sec_availability >=1. and options.go: ## there is at least one copy of each block on disk. We should go ahead and let it go. 
wfh.sendLog('assignor',"The secondary %s is available %s times on disk, and usable"%( sec, sec_availability)) else: ## not even a copy on disk anywhere !!!! sites_allowed = [] ## will block the assignment wfh.sendLog('assignor',"The secondary %s is nowhere on disk"% sec) #just continue without checking continue #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"Intersecting with secondary requirement, now allowed %s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) ## if they are requested for processing, they should bbe all closed already closeAllBlocks(url, prim, blocks) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) if primary_aaa: available_fractions[prim] = getDatasetBlocksFraction(url, prim, 
only_blocks = blocks) sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] if primary_aaa: sites_all_data = set() for (psite,(there,frac)) in presence.items(): if there: sites_all_data.update( SI.SE_to_CEs(psite) ) sites_all_data = list(sites_all_data) #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there])) sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] if primary_aaa: sites_with_any_data = set() for psite in presence.keys(): sites_with_any_data.update( SI.SE_to_CEs(psite) ) sites_with_any_data = list(sites_with_any_data) #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()])) holding_but_not_allowed = set() for se_site in presence.keys(): if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)): holding_but_not_allowed.add( se_site ) #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted( holding_but_not_allowed )) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations 
and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% 
copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off") primary_aaa=False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update( aaa_mapping.get(site,[]) ) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed)) isStoreResults = ( 'StoreResults' == wfh.request.setdefault('RequestType',None) ) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled+= 1 wfh.sendLog('assignor',"Cannot assign StoreResults request because MergedLFN is missing") sendLog('assignor','Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if not isStoreResults: sites_allowed = sites_with_any_data else: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog('assignor',"Cannot assign StoreResults request because SiteWhitelist is missing") sendLog('assignor','Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue available_fractions = {} wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for 
osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints",sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled+=1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low ))) copies_wanted = max(1., copies_wanted-1.) if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) above_good = all([available >= do_partial for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. 
sending back to considered.'%( wfo.name ), level='delay') n_stalled+=1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good): wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled+=1 continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor',"cannot be assign with no matched sites") sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog('assignor',"Reading primary through xrootd at 
%s"%sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed)) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v def pick_campaign( assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update( assign_parameters.get('parameters',{}) ) if options.force_options: pick_campaign( assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign( assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog('assignor','Holding on to the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor','Change of splitting is on hold') n_stalled+=1 continue if split_check==None or split_check==False: n_stalled+=1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog('assignor','Applying the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() 
eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") if isHEPCloudReady(url) and wfh.isGoodForNERSC(): parameters['Team'] = 'hepcloud' parameters['SiteWhitelist'] = ['T3_US_NERSC'] if primary: parameters['TrustSitelists'] = True if secondary: parameters['TrustPUSitelists'] = True sendEmail("sending work to hepcloud","pleasse check on %s"% wfh.request['RequestName'], destination=['*****@*****.**']) ## make sure to autoapprove all NonCustodialSites 
parameters['AutoApproveSubscriptionSites'] = list(set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites',[]))) result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock( secure, reason = 'assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor',"%s workflows cannot be assigned. Please take a look"%(n_stalled), level='critical')
def actor(url, options=None): if moduleLock(wait=False, silent=True)(): return if userLock('actor'): return up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed try: action_list = json.loads( os.popen( 'curl -s -k https://vocms0113.cern.ch:80/getaction?days=15'). read()) ## now we have a list of things that we can take action on except: try: action_list = json.loads( os.popen( 'curl -s -k https://vocms0113.cern.ch/getaction?days=15'). read()) except: print "Not able to load action list :(" sendLog('actor', 'Not able to load action list', level='critical') return if options.actions: action_list = json.loads(open(options.actions).read()) print json.dumps(action_list, indent=2) if not action_list: print "EMPTY!" return wf_list = action_list.keys() print json.dumps(sorted(wf_list), indent=2) if options.spec: wf_list = [wf for wf in wf_list if options.spec in wf] max_per_round = UC.get('max_per_round').get('actor', None) if max_per_round: random.shuffle(wf_list) wf_list = wf_list[:max_per_round] for wfname in wf_list: print '-' * 100 print "Looking at", wfname, "for recovery options" to_clone = False to_acdc = False to_force = False to_hold = False something_to_do = False tasks = action_list[wfname].get('Parameters', None) to_acdc = action_list[wfname].get('Action', None) == 'acdc' to_clone = action_list[wfname].get('Action', None) == 'clone' to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['by-pass', 'bypass'] to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['onhold', 'on-hold'] if not to_acdc and not to_clone and not to_force and not to_hold: sendLog( 'actor', 'Action submitted for something 
other than acdc, clone, bypass or hold for workflow %s' % wfname, level='critical') print json.dumps(action_list[wfname], indent=2) continue if not tasks and to_acdc: sendLog('actor', 'Empty action submitted for workflow %s' % wfname, level='critical') print "Moving on. Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor', 'Going to clone %s' % wfname) results = [] datasets = set(wfi.request['OutputDatasets']) comment = "" if 'comment' in tasks: comment = ", reason: " + tasks['comment'] wfi.sendLog( 'actor', "invalidating the workflow by traffic controller %s" % comment) #Reject all workflows in the family #first reject the original workflow. reqMgrClient.invalidateWorkflow( url, wfi.request['RequestName'], current_status=wfi.request['RequestStatus'], cascade=False) #Then reject any ACDCs associated with that workflow family = getWorkflowById(url, wfi.request['PrepID'], details=True) for fwl in family: print "rejecting", fwl['RequestName'], fwl['RequestStatus'] wfi.sendLog( 'actor', "rejecting %s, previous status %s" % (fwl['RequestName'], fwl['RequestStatus'])) reqMgrClient.invalidateWorkflow( url, fwl['RequestName'], current_status=fwl['RequestStatus'], cascade=False) datasets.update(fwl['OutputDatasets']) #Invalidate all associated output datasets for dataset in datasets: results.append(setDatasetStatus(dataset, 'INVALID')) if all(map(lambda result: result in ['None', None, True], results)): wfi.sendLog('actor', "%s and children are rejected" % wfname) cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except Exception as e: sendLog( 'actor', 'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.' 
% wfname, level='critical') wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) print str(e) ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again #remove_action(wfname) if not cloned: recover = False wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) sendLog('actor', 'Failed to create clone for %s!' % wfname, level='critical') else: wfi.sendLog('actor', "Workflow %s cloned" % wfname) #=========================================================== elif to_force: wfi.sendLog('actor', 'Bypassing from workflow traffic controler request') forcing = json.loads( open( '/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json' ).read()) forcing.append(wfname) open('/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json', 'w').write(json.dumps(sorted(set(forcing)))) elif to_hold: wfi.sendLog('actor', 'Holding on workflow traffic controler request') holding = json.loads( open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json'). read()) holding.append(wfname) open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json', 'w').write(json.dumps(sorted(set(holding)))) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append( {setting: allTasksDefaults[setting]}) print "Tasks is " print json.dumps(tasks, indent=2) all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog( 'actor', 'Cannot create ACDCS for %s because WMErr cannot be reached.' % wfname, level='critical') continue if not WMErr: wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname) print "FYI getWMErrors is blank. 
Presumably there are only unreported errors" # continue try: where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo( ) print "Where to run = " print where_to_run except: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') print "Moving on. Cannot access recovery info for " + wfname continue if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because site list cannot be found.' % wfname, level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 if WMErr: for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for", wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog( 'actor', 'Cannot create ACDCS for %s because it is a pLHE workflow.' % wfname, level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task # print "Full task name is " + fulltaskname wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in [ 'Processing', 'Production', 'Merge' ]: wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because the taskType is %s. 
Can only ACDC Processing, Production, or Merge tasks" % (fulltaskname, task_info.taskType)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites = [SI.SE_to_CE(actions[action])] else: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in actions[action] ])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list( set([SI.SE_to_CE(site) for site in where_to_run[task]])) print "Found", sorted( assign_to_sites ), "as sites where to run the ACDC at, from the acdc doc of ", wfname print "Going to run at", sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do=options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog( 'actor', 'ACDC created for task %s. 
Actions taken \n%s' % (fulltaskname, json.dumps(actions))) #team = wfi.request['Teams'][0] team = 'production' parameters = { 'SiteWhitelist': sorted(assign_to_sites), 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request[ 'RequestType'] == 'TaskChain' and 'Merge' in task.split( '/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists'] == 'true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'secondary' in actions: if actions['secondary'] == 'enabled': print 'Enabling reading the secondary input via xrootd' parameters['TrustPUSitelists'] = True elif actions['secondary'] == 'disabled': parameters['TrustPUSitelists'] = False #in case secondary is blank or not set to enabled or disabled elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC", acdc parameters['execute'] = True wfi.sendLog('actor', "%s was assigned for recovery" % acdc) else: print "no assignment done with this ACDC", acdc sendLog('actor', "%s needs to be assigned" % (acdc), level='critical') continue # print parameters result = reqMgrClient.assignWorkflow( url, acdc, team, parameters) if not result: print acdc, "was not assigned" sendLog('actor', "%s needs to be assigned" % (acdc), level='critical') else: 
recovering.add(acdc) wfi.sendLog('actor', "ACDCs created for %s" % wfname) #=========================================================== if recover and options.do: remove_action(wfname) if message_to_user: print wfname, "to be notified to user(DUMMY)", message_to_user if message_to_ops: print 'message' #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return
parser.add_option('--max', help='Limit the number of indexion', default=0, type=int) parser.add_option('--force', help='Re-insert information', default=False, action='store_true') (options, args) = parser.parse_args() specific = options.workflow.split(',') if options.workflow else None check_months = options.months.split(',') if options.months else None check_years = options.years.split(',') if options.years else None ml = moduleLock(component='createLogDB_%s' % options.workflow, wait=True, silent=True) if ml(): print "existing createLogDB", options.workflow sys.exit(1) if check_years: years = check_years else: years = filter(None, os.popen('ls /eos/cms/store/logs/prod/').read().split('\n')) vetoes = [ 'Express_Run', 'PromptReco_Run', 'Repack_Run', 'Validation', 'test', 'Test' ] print years
year = int(time.strftime("%Y", time.gmtime())) lastyear = year-1 parser = optparse.OptionParser() parser.add_option('--workflow', help='Which workflow logs', default=None) parser.add_option('--years',help='What year to parse', default='%d,%d'%(year, lastyear)) parser.add_option('--months',help='What month to parse', default=None) parser.add_option('--max',help='Limit the number of indexion', default=0, type=int) parser.add_option('--force',help='Re-insert information', default=False,action='store_true') (options,args) = parser.parse_args() specific = options.workflow.split(',') if options.workflow else None check_months = options.months.split(',') if options.months else None check_years= options.years.split(',') if options.years else None ml = moduleLock( component='createLogDB_%s'%options.workflow, wait=True, silent=True) if ml(): print "existing createLogDB",options.workflow sys.exit(1) if check_years: years = check_years else: years = filter(None,os.popen('ls /eos/cms/store/logs/prod/').read().split('\n')) vetoes = ['Express_Run','PromptReco_Run','Repack_Run','Validation','test','Test'] print years n_index=0 for year in years: if options.max and n_index>options.max: break
tags = [ 'pilup', 'input', 'output', 'lock', 'unlock', 'tape', 'stuck-tape', 'missing-tape' ] for tag in tags: v = sum([ info['size'] for ds, info in load.items() if tag in info['reasons'] ]) / 1024. print "\t %10f [TB] remaining because of %s" % (v, tag) if __name__ == "__main__": url = 'cmsweb.cern.ch' mlock = moduleLock(component='remainor', locking=False) ml = mlock() parser = optparse.OptionParser() parser.add_option('-s', '--site', help="coma separated list of site to parse", default="") parser.add_option('-n', '--nsites', help="number of site to parse", default=0, type=int) parser.add_option('-d', '--ndatasets', help="number of top datasets to parse",
def completor(url, specific): mlock = moduleLock(silent=True) if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): return use_mcm = up.status['mcm'] if use_mcm: mcm = McMClient(dev=False) safe_mode = False CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() JC = JIRAClient() if up.status.get('jira', False) else None wfs = [] wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all()) wfs.extend( session.query(Workflow).filter( Workflow.status.startswith('assistance')).all()) ## just take it in random order so that not always the same is seen random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('completor', None) if max_per_round and not specific: wfs = wfs[:max_per_round] all_stuck = set() ## take into account what stagor was saying for itry in range(5): try: all_stuck.update( json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))) break except: time.sleep(2) for itry in range(5): try: ## take into account the block that needed to be repositioned recently all_stuck.update([ b.split('#')[0] for b in json.loads( eosRead('%s/missing_blocks.json' % monitor_dir)) ]) break except: time.sleep(2) ## take into account all stuck block and dataset from transfer team all_stuck.update(getAllStuckDataset()) good_fractions = {} overdoing_fractions = {} truncate_fractions = {} timeout = {} campaign_injection_delay = {} for c in CI.campaigns: if 'force-complete' in CI.campaigns[c]: good_fractions[c] = CI.campaigns[c]['force-complete'] if 'truncate-complete' in CI.campaigns[c]: truncate_fractions[c] = CI.campaigns[c]['truncate-complete'] if 'force-timeout' in CI.campaigns[c]: timeout[c] = CI.campaigns[c]['force-timeout'] if 'injection-delay' in CI.campaigns[c]: campaign_injection_delay[c] = CI.campaigns[c]['injection-delay'] if 'overdoing-complete' in CI.campaigns[c]: overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete'] long_lasting = {} WI = wtcInfo() overrides = WI.getForce() 
if use_mcm: ## add all workflow that mcm wants to get force completed mcm_force = mcm.get('/restapi/requests/forcecomplete') ## assuming this will be a list of actual prepids overrides['mcm'] = mcm_force print "can force complete on" print json.dumps(good_fractions, indent=2) print "can truncate complete on" print json.dumps(truncate_fractions, indent=2) print "can overide on" print json.dumps(overrides, indent=2) max_force = UC.get("max_force_complete") max_priority = UC.get("max_tail_priority") injection_delay_threshold = UC.get("injection_delay_threshold") injection_delay_priority = UC.get("injection_delay_priority") delay_priority_increase = UC.get("delay_priority_increase") default_fraction_overdoing = UC.get('default_fraction_overdoing') set_force_complete = set() # priority and time above which to fire a JIRA jira_priority_and_delays = { 110000: 21, 90000: 28, # 80000 : 60, #0 : 90 } for wfo in wfs: if specific and not specific in wfo.name: continue print "looking at", wfo.name ## get all of the same wfi = workflowInfo(url, wfo.name) pids = wfi.getPrepIDs() skip = False campaigns = wfi.getCampaigns() #if not any([c in good_fractions.keys() for c in campaigns]): skip=True #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True for user, spec in overrides.items(): if not spec: continue spec = filter(None, spec) if not wfi.request['RequestStatus'] in [ 'force-complete', 'completed' ]: if any(s in wfo.name for s in spec) or (wfo.name in spec) or any( pid in spec for pid in pids) or any(s in pids for s in spec): wfi = workflowInfo(url, wfo.name) forceComplete(url, wfi) skip = True wfi.notifyRequestor( "The workflow %s was force completed by request of %s" % (wfo.name, user), do_batch=False) wfi.sendLog( 'completor', '%s is asking for %s to be force complete' % (user, wfo.name)) break if wfo.status.startswith('assistance'): skip = True if skip: continue priority = wfi.request['RequestPriority'] if not 'Campaign' in wfi.request: continue if not 
wfi.request['RequestStatus'] in [ 'acquired', 'running-open', 'running-closed' ]: continue ## until we can map the output to task ... output_per_task = wfi.getOutputPerTask( ) ## can use that one, and follow mapping good_fraction_per_out = {} good_fraction_nodelay_per_out = {} truncate_fraction_per_out = {} #allowed_delay_per_out = {} for task, outs in output_per_task.items(): task_campaign = wfi.getCampaignPerTask(task) for out in outs: good_fraction_per_out[out] = good_fractions.get( task_campaign, 1000.) good_fraction_nodelay_per_out[out] = overdoing_fractions.get( task_campaign, default_fraction_overdoing) truncate_fraction_per_out[out] = truncate_fractions.get( task_campaign, 1000.) #allowed_delay_per_out[out] = timeout.get(task_campaign, 14) #print "force at", json.dumps( good_fraction_per_out, indent=2) #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2) now = time.mktime(time.gmtime()) / (60 * 60 * 24.) priority_log = filter(lambda change: change['Priority'] == priority, wfi.request.get('PriorityTransition', [])) if not priority_log: print "\tHas no priority log" priority_delay = 0 else: then = max([change['UpdateTime'] for change in priority_log]) / (60. * 60. * 24.) priority_delay = now - then ## in days print "priority was set to", priority, priority_delay, "[days] ago" running_log = filter( lambda change: change["Status" ] in ["running-open", "running-closed"], wfi.request['RequestTransition']) if not running_log: print "\tHas no running log" delay = 0 else: then = max([change['UpdateTime'] for change in running_log]) / (60. * 60. * 24.) 
delay = now - then ## in days #further check on delays cpuh = wfi.getComputingTime(unit='d') wfi.sendLog( 'completor', "Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago" % (cpuh, delay, priority, priority_delay)) if priority_delay != 0 and priority_delay < delay: ## regardless when it started running, set the delay to when priority was changed last delay = priority_delay ## this is supposed to be the very initial request date, inherited from clones injection_delay = None original = wfi if 'OriginalRequestName' in original.request: ## go up the clone chain original = workflowInfo(url, original.request['OriginalRequestName']) injected_log = filter( lambda change: change["Status"] in ["assignment-approved"], original.request['RequestTransition']) if injected_log: injected_on = injected_log[-1]['UpdateTime'] / (60. * 60. * 24.) injection_delay = now - injected_on delay_for_priority_increase = injection_delay #delay_for_priority_increase = delay (w, d) = divmod(delay, 7) print "\t" * int( w) + "Running since", delay, "[days] priority=", priority pop_a_jira = False ping_on_jira = 7 * (24 * 60 * 60) # 7 days for jp, jd in jira_priority_and_delays.items(): if priority >= jp and delay >= jd: pop_a_jira = True if pop_a_jira and JC: j, reopened, just_created = JC.create_or_last( prepid=wfi.request['PrepID'], priority=wfi.request['RequestPriority'], label='Late', reopen=True) last_time = JC.last_time(j) since_last_ping = time.mktime(time.gmtime()) - last_time if since_last_ping > ping_on_jira or just_created: j_comment = "Running since %.1f [days] at priority %d" % ( delay, priority) JC.comment(j.key, j_comment) if delay_for_priority_increase != None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority: quantized = 5000 ## quantize priority tail_cutting_priority = wfi.request['InitialPriority'] + int( (delay_priority_increase * (delay_for_priority_increase - injection_delay_threshold) 
/ 7) / quantized) * quantized tail_cutting_priority += 101 ## to signal it is from this mechanism tail_cutting_priority = min( 400000, tail_cutting_priority) ## never go above 400k priority tail_cutting_priority = max( tail_cutting_priority, priority) ## never go below the current value if priority < tail_cutting_priority: if max_priority: sendLog( 'completor', "%s Injected since %s [days] priority=%s, increasing to %s" % (wfo.name, delay_for_priority_increase, priority, tail_cutting_priority), level='critical') wfi.sendLog( 'completor', 'bumping priority to %d for being injected since %s' % (tail_cutting_priority, delay_for_priority_increase)) reqMgrClient.changePriorityWorkflow( url, wfo.name, tail_cutting_priority) max_priority -= 1 else: sendLog( 'completor', "%s Injected since %s [days] priority=%s, would like to increase to %s" % (wfo.name, delay_for_priority_increase, priority, tail_cutting_priority), level='critical') wfi.sendLog( 'completor', 'would like to bump priority to %d for being injected since %s' % (tail_cutting_priority, delay_for_priority_increase)) print "Could be changing the priority to higher value, but too many already were done" _, prim, _, _ = wfi.getIO() is_stuck = all_stuck & prim if is_stuck: wfi.sendLog('completor', '%s is stuck' % ','.join(is_stuck)) monitor_delay = 7 allowed_delay = max([timeout.get(c, 14) for c in campaigns]) monitor_delay = min(monitor_delay, allowed_delay) ### just skip if too early, just for the sake of not computing the completion fraction just now. 
# maybe this is fast enough that we can do it for all if delay <= monitor_delay: print "not enough time has passed yet" continue long_lasting[wfo.name] = { "delay": delay, "injection_delay": injection_delay } percent_completions = wfi.getCompletionFraction(caller='completor') if not percent_completions: sendLog('completor', '%s has no output at all' % wfo.name, level='critical') continue is_over_allowed_delay = (all([ percent_completions[out] >= good_fraction_per_out.get(out, 1000.) for out in percent_completions ]) and delay >= allowed_delay) is_over_truncation_delay = (is_stuck and (all([ percent_completions[out] >= truncate_fraction_per_out.get( out, 1000.) for out in percent_completions ])) and delay >= allowed_delay) is_over_completion = (all([ percent_completions[out] >= good_fraction_nodelay_per_out.get( out, 1000.) for out in percent_completions ])) if is_over_completion: wfi.sendLog( 'completor', "all is over completed %s\n %s" % (json.dumps(good_fraction_nodelay_per_out, indent=2), json.dumps(percent_completions, indent=2))) elif is_over_allowed_delay: wfi.sendLog( 'completor', "all is above %s \n%s" % (json.dumps(good_fraction_per_out, indent=2), json.dumps(percent_completions, indent=2))) elif is_over_truncation_delay: wfi.sendLog( 'completor', "all is above %s truncation level, and the input is stuck\n%s" % (json.dumps(truncate_fraction_per_out, indent=2), json.dumps(percent_completions, indent=2))) else: long_lasting[wfo.name].update({ 'completion': sum(percent_completions.values()) / len(percent_completions), 'completions': percent_completions }) ## do something about the agents this workflow is in long_lasting[wfo.name]['agents'] = wfi.getAgents() wfi.sendLog( 'completor', "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s" % (json.dumps(percent_completions, indent=2), json.dumps(good_fraction_per_out, indent=2), json.dumps(truncate_fraction_per_out, indent=2), json.dumps(long_lasting[wfo.name]['agents'], indent=2))) continue #for 
output in percent_completions: # completions[output]['injected'] = then ran_at = wfi.request['SiteWhitelist'] wfi.sendLog('completor', "Required %s, time spend %s" % (cpuh, delay)) ##### WILL FORCE COMPLETE BELOW # only really force complete after n days ## find ACDCs that might be running if max_force > 0: print "going for force-complete of", wfo.name if not safe_mode: forceComplete(url, wfi) set_force_complete.add(wfo.name) wfi.sendLog('completor', 'going for force completing') wfi.notifyRequestor( "The workflow %s was force completed for running too long" % wfo.name) max_force -= 1 else: sendEmail( 'completor', 'The workflow %s is ready for force complete, but completor is in safe mode' % wfo.name) else: wfi.sendLog( 'completor', "too many completion this round, cannot force complete") if set_force_complete: sendLog( 'completor', 'The followings were set force-complete \n%s' % ('\n'.join(set_force_complete))) #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2)) text = "These have been running for long" #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )) eosFile('%s/longlasting.json' % monitor_dir, 'w').write(json.dumps(long_lasting, indent=2)).close() for wf, info in sorted(long_lasting.items(), key=lambda tp: tp[1]['delay'], reverse=True): delay = info['delay'] text += "\n %s : %s days" % (wf, delay) if 'completion' in info: text += " %d%%" % (info['completion'] * 100) print text
def transferor(url, specific=None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() #NLI = newLockInfo() #if not NLI.free(): return LI = lockInfo() if not LI.free(): return mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).filter( ~Workflow.status.contains('custodial')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... 
done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] max_per_round = UC.get('max_per_round').get('transferor', None) print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) all_to_include = session.query(Workflow).filter( Workflow.status.startswith('considered')).all() if len(cache) > 2000: max_to_include = max_per_round random.shuffle(cache) ## randomize first by wf name cache = sorted(cache, key=lambda r: r['RequestPriority'], reverse=True) ## order by prio highest = [r['RequestName'] for r in cache[:max_to_include]] all_to_include = [wfo for wfo in all_to_include if wfo.name in highest] print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len( all_to_include) for wfo in all_to_include: print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = defaultdict(float) ignored_input_sizes = defaultdict(float) input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." 
#stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read()) stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print wfo.name, "staging" (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() blocks = wfh.getBlocks() for prim in primary: ds_s = dss.get(prim, blocks=blocks) if prim in stucks: wfh.sendLog( 'transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s)) ignored_input_sizes[prim] = max(ds_s, ignored_input_sizes[prim]) else: input_sizes[prim] = max(ds_s, input_sizes[prim]) wfh.sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s)) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... 
done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." input_blocks = {} for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() blocks = wfh.getBlocks() input_blocks[wfo.name] = blocks for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim, blocks=blocks) input_sizes[prim] = max(prim_size, input_sizes[prim]) primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendLog( "transferor", "No request in staging, using first request to set priority limit") if len(wfs_and_wfh): min_transfer_priority = wfs_and_wfh[0][1].request[ 'RequestPriority'] in_transfer_priority = 
wfs_and_wfh[0][1].request['RequestPriority'] else: return cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. 
max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = defaultdict(float) went_over_budget = False destination_cache = {} no_goes = set() if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: if wfh.isRelval(): wfo.status = 'forget' else: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() blocks = input_blocks.get(wfo.name, wfh.getBlocks()) if blocks: print "Reading only", len(blocks), "blocks in input" this_load = sum([dss.get(prim, blocks=blocks) for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % 
(min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = {} overide_parameters = {} check_secondary = (not wfh.isRelval()) output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: overide_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'transferor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('transferor', 'These data tiers %s are not allowed in %s' % (','.join(banned_tier), wfo.name), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('transferor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('transferor', critical_msg, level='critical') if not options.go: no_go = True for sec in secondary: if sec in allowed_secondary: overide_parameters.update(allowed_secondary[sec]) if 'SiteWhitelist' in overide_parameters: sites_allowed = list( set(sites_allowed) & set(overide_parameters['SiteWhitelist'])) wfh.sendLog( 'transferor', 'Intersecting with the overriding whitelist parameters, allowed sites become {}' .format(sites_allowed)) if no_go: continue if passing_along >= allowed_to_handle: #if 
int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. 
Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue # break ## try this for a while to make things faster ## the site white list considers site, campaign, memory and core information if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): LI.lock(dataset, reason='staging') if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) if blocks: print "limiting to blocks", "\n".join(sorted(blocks)) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and 
completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be #prim_destination = [site for site in destinations.keys() if not site in prim_location] prim_destination = [ site for (site, info) in destinations.items() if info['data_fraction'] == 1 and info['completion'] != 100 ] ## veto the site with no current disk space, for things that are not relval prim_destination = [ site for site in prim_destination if (SI.disk[site] or wfh.isRelval()) ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "Counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers potential_destinations = len(prim_to_distribute) #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## do we want to restrict transfers if the amount of site in vetoe are too large ? 
wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: existings = session.query(TransferImp).filter( TransferImp.phedexid == int(latching)).filter( TransferImp.workflow_id == wfo.id).all() if not existings: tri = TransferImp(phedexid=int(latching), workflow=wfo) print "adding", wfo.id, "with phedexid", latching session.add(tri) else: for existing in existings: existing.active = True session.flush() can_go = False transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
#copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The input is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim)) sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim), level='critical') print json.dumps(prim_to_distribute, indent=2) print json.dumps(prim_location, indent=2) print json.dumps(prim_destination, indent=2) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] print "changed to" print json.dumps(prim_to_distribute, indent=2) if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? 
#tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) ## prune the blocks/destination that are already in the making, so that subscription don't overlap for site in spreading: for block in list(spreading[site]): if site in destinations and block in destinations[ site]['blocks'].keys(): ## prune it spreading[site].remove(block) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) else: can_go = False allowed = False if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] if 'SecondaryLocation' in overide_parameters: override_sec_destination = overide_parameters[ 'SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by 
workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = set( [SI.CE_to_SE(site) for site in sites_allowed]) destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if k in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] print sec, json.dumps(destinations, indent=2) sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] ## this is in SE else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. 
] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] ## how to make unified understand that it has to wait for the secondary if the sec_destination and #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in sec_location ] #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [ site for site in sec_to_distribute if not SI.CE_to_SE(site) in sec_destination ] presitespace_sec_to_distribute = copy.deepcopy( sec_to_distribute) #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] #sec_to_distribute = [site for site in sec_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] sec_to_distribute = [ site for site in sec_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## at this point you have a problem if len(sec_to_distribute) == 0 and len( presitespace_sec_to_distribute): sendLog( 'transferor', '%s is getting no possible destinations because of lack of space. To be decided what to do in general' % (sec), level='critical') if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) 
> sec_size or wfh.isRelval(): wfh.sendLog('transferor', 'Sending %s to %s' % (sec, site)) all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog( 'transferor', '%s is too big (%s) for %s (%s). %s will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024, wfo.name), level='critical') wfh.sendLog( 'transferor', '%s is too big (%s) for %s (%s). will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024)) else: ## this is bas overall print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog( 'transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(sorted(no_goes)), level='critical') print "accumulated transfers" print json.dumps(all_transfers, 
indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets #if site in SI.sites_veto_transfer: # print site,"does not want transfers" # continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue transfered_items = defaultdict(set) if execute: priority = 'normal' cds = [ ds for ds in set(datasets + block_datasets) if ds in max_priority ] ## bucketize the transfers by priority of workflows prioritized_items = defaultdict(set) for item in items_to_transfer: d = item.split('#')[0] p = max_priority.get(d, 80000) q = 'normal' if p > 100000: q = 
'reserved' elif p < 70000: q = 'low' prioritized_items[q].add(item) for priority, items in prioritized_items.items(): result = makeReplicaRequest(url, site_se, list(items), 'prestaging', priority=priority, approve=True) if result: these_transfers = [ o['id'] for o in result['phedex']['request_created'] ] #phedexids.extend( these_transfers ) for ph in these_transfers: transfered_items[ph].update(items) else: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items, site_se), level='critical') #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True) #phedexids = [o['id'] for o in result['phedex']['request_created']]: #else: # #result= {'phedex':{'request_created' : []}} # phedexids = [] # fake_id-=1 if not transfered_items: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items_to_transfer, site), level='critical') continue for phedexid, items in transfered_items.items(): print phedexid, "transfer created" for transfering in list( set(map(lambda it: it.split('#')[0], items))): for wfid in workflow_dependencies[transfering]: new_transfer = session.query(TransferImp).filter( TransferImp.phedexid == int(phedexid)).filter( TransferImp.workflow_id == wfid).first() if not new_transfer: new_transfer = TransferImp( phedexid=phedexid, workflow=session.query(Workflow).get(wfid)) session.add(new_transfer) else: new_transfer.active = True wf_id_in_prestaging.add(wfid) #session.commit() for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" #session.commit() ## one big session commit at the end that everything went fine session.commit()
def assignor(url, specific=None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read()) aaa_mapping = json.loads( open('%s/equalizor.json' % monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read())) all_stuck.update(getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') ##order by priority instead of random if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key=lambda r: r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank(wfn): return cache.index(wfn) if wfn in cache else 0 wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True) print "10 first", [wfo.name for wfo in wfos[:10]] print "10 last", [wfo.name for wfo in wfos[-10:]] else: random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + 
n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" if options.partial: options_text += ", partial option is ON" options_text += ", good fraction is %.2f" % options.good_enough wfh.sendLog('assignor', "%s to be assigned%s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & 
set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('assignor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('assignor', critical_msg, level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = wfh.getBlockWhiteList() rwl = 
wfh.getRunWhiteList() if rwl: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=rwl))) lwl = wfh.getLumiWhiteList() if lwl: ## augment with lumi white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=lwl))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog( 'assignor', "Overiding partial copy assignment to %.2f fraction" % do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] if secondary_aaa: if not one_secondary_locations: sec_availability = getDatasetBlocksFraction(url, sec) if sec_availability >= 1. 
and options.go: ## there is at least one copy of each block on disk. We should go ahead and let it go. wfh.sendLog( 'assignor', "The secondary %s is available %s times on disk, and usable" % (sec, sec_availability)) else: ## not even a copy on disk anywhere !!!! sites_allowed = [] ## will block the assignment wfh.sendLog( 'assignor', "The secondary %s is nowhere on disk" % sec) #just continue without checking continue #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "Intersecting with secondary requirement, now allowed %s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) ## if they are requested for processing, they should bbe all closed already closeAllBlocks(url, prim, blocks) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], 
only_blocks=blocks) if primary_aaa: available_fractions[prim] = getDatasetBlocksFraction( url, prim, only_blocks=blocks) sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] if primary_aaa: sites_all_data = set() for (psite, (there, frac)) in presence.items(): if there: sites_all_data.update(SI.SE_to_CEs(psite)) sites_all_data = list(sites_all_data) #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there])) sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] if primary_aaa: sites_with_any_data = set() for psite in presence.keys(): sites_with_any_data.update(SI.SE_to_CEs(psite)) sites_with_any_data = list(sites_with_any_data) #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()])) holding_but_not_allowed = set() for se_site in presence.keys(): if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)): holding_but_not_allowed.add(se_site) #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted(holding_but_not_allowed)) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in 
list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite for osite in opportunistic_sites if osite in SI.sites_not_ready ])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there 
is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if not isStoreResults: sites_allowed = sites_with_any_data else: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue available_fractions = {} wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if 
do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints", sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled += 1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog( 'assignor', "The workflow can run at %s under low pressure currently" % (','.join(allowed_and_low))) copies_wanted = max(1., copies_wanted - 1.) if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) above_good = all([ available >= do_partial for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( 'assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' 
% (wfo.name), level='delay') n_stalled += 1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not ( do_partial and above_good): wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled += 1 continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if 
secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = 
[getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." 
) if isHEPCloudReady(url) and wfh.isGoodForNERSC(): parameters['Team'] = 'hepcloud' parameters['SiteWhitelist'] = ['T3_US_NERSC'] if primary: parameters['TrustSitelists'] = True if secondary: parameters['TrustPUSitelists'] = True sendEmail("sending work to hepcloud", "pleasse check on %s" % wfh.request['RequestName'], destination=['*****@*****.**']) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def closor(url, specific=None, options=None):
    """Close out workflows and announce finished release-validation batches.

    Selects workflows from the database (status ``close``, or statuses
    containing ``announce`` but not ``announced`` when the announce option
    is set), runs one ``CloseBuster`` per workflow through a
    ``ThreadHandler``, then applies what each thread collected (outputs,
    new status, new wm_status) to the database.  Afterwards it reports
    late files, held workflows and completed batches.

    Args:
        url: request manager url, handed to ``getWorkflows`` and to every
            ``CloseBuster``.
        specific: optional substring; only workflows whose name contains
            it are processed.
        options: optional parsed command-line options.  The attributes
            read here are ``manual``, ``announce``, ``limit`` and
            ``threads``.

    Returns:
        None.
    """
    if userLock():
        return

    # `options` defaults to None in the signature: read the flags defensively
    # instead of dereferencing None.
    manual = getattr(options, 'manual', False)
    limit = getattr(options, 'limit', None)
    jump_the_line = getattr(options, 'announce', False)

    mlock = moduleLock()
    if mlock() and not manual:
        return
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check():
        return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    BI = batchInfo()
    CloseI = closeoutInfo()

    all_late_files = []

    if jump_the_line:
        print("announce option is on. Checking on things on-going ready to be announced")
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print("regular option. Checking on things done and to be announced")
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    if specific:
        wfs = [wfo for wfo in wfs if specific in wfo.name]
    wfs_n = [w.name for w in wfs]

    print("unique names?")
    print(len(set(wfs_n)) == len(wfs_n))

    held = set()

    print("%s closing" % len(wfs))
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if limit:
        max_per_round = limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        # position of the first occurrence of each name in the priority-sorted
        # list; names that are not listed rank 0, as before
        position = {}
        for index, request in enumerate(all_closedout):
            position.setdefault(request['RequestName'], index)

        wfs = sorted(wfs, key=lambda wfo: position.get(wfo.name, 0), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_extreme_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    closers = []

    print("%s closing" % len(wfs))
    th_start = time.mktime(time.gmtime())

    # workflows that are only ever handled in manual mode
    manual_only = (
        'cmsunified_task_HIG-RunIIFall17wmLHEGS-05036__v1_T_200712_005621_4159'.lower(),
        'pdmvserv_task_HIG-RunIISummer16NanoAODv7-03979__v1_T_200915_013748_1986'.lower(),
    )

    for wfo in wfs:
        if specific and specific not in wfo.name:
            continue
        lowered_name = wfo.name.lower()
        if not manual and any(fragment in lowered_name for fragment in manual_only):
            continue
        closers.append(
            CloseBuster(
                wfo=wfo,
                url=url,
                CI=CI,
                UC=UC,
                jump_the_line=jump_the_line,
                batch_goodness=batch_goodness,
                batch_go=batch_go,
                batch_warnings=batch_warnings,
                batch_extreme_warnings=batch_extreme_warnings,
                all_late_files=all_late_files,
                held=held,
            ))

    handler_args = dict(threads=closers,
                        sleepy=10,
                        timeout=None,
                        verbose=True,
                        label='closor')
    if options is not None:
        handler_args['n_threads'] = options.threads
    # NOTE(review): without options we rely on ThreadHandler providing its own
    # default for n_threads -- confirm against its constructor.
    run_threads = ThreadHandler(**handler_args)

    run_threads.start()

    ## waiting on all to complete
    while run_threads.is_alive():
        time.sleep(5)

    JC = JIRAClient() if up.status.get('jira', False) else None
    print("%s finished thread to gather information from" % len(run_threads.threads))
    failed_threads = 0
    for to in run_threads.threads:
        if to.failed:
            failed_threads += 1
            continue
        if to.outs:
            for outO in to.outs:
                out = outO.datasetname
                odb = session.query(Output).filter(
                    Output.datasetname == out).first()
                if not odb:
                    print("adding an output object %s" % out)
                    session.add(outO)
                else:
                    odb.date = outO.date

        if to.to_status:
            to.wfo.status = to.to_status
            if JC and to.to_status == "done" and to.wfi:
                jiras = JC.find({"prepid": to.wfi.request['PrepID']})
                for jira in jiras:
                    JC.close(jira.key)

        if to.to_wm_status:
            to.wfo.wm_status = to.to_wm_status
        if to.closing:
            CloseI.pop(to.wfo.name)

        # persist what this thread produced before moving to the next one
        session.commit()

    th_stop = time.mktime(time.gmtime())

    if wfs:
        time_spend_per_workflow = (th_stop - th_start) / float(len(wfs))
        print("Average time spend per workflow is %s" % time_spend_per_workflow)

    # The previous test divided two integers before converting to float, so
    # under Python 2 it only triggered once failures reached the pool size
    # (and raised on an empty pool).  Any failure is worth reporting.
    if failed_threads > 0:
        failure_report = '%d/%d threads have failed, better check this out' % (
            failed_threads, run_threads.n_threads)
        # NOTE(review): the 'checkor' label inside closor looks like a
        # copy/paste leftover; kept as-is in case log routing depends on it.
        sendLog('checkor', failure_report, level='critical')
        sendEmail('checkor', failure_report)

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files
        if info['retries'] >= retries_late
        and info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print(subject)
        # context manager so the handle is flushed and closed deterministically
        with open('%s/stuck_files.json' % monitor_dir, 'w') as stuck_out:
            stuck_out.write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" % ("\n".join(sorted(held))),
                level='critical')

    for bname, go in batch_go.items():
        if not go:
            continue
        subject = "Release Validation Samples Batch %s" % bname
        issues = ""
        if batch_extreme_warnings[bname]:
            subject = "Low Statistics for %s" % bname
            # this literal is not %-formatted, so a single '%' is what the
            # reader should see (it used to render as '%%')
            issues = "The following datasets have outstanding completion (<50%) issues:\n\n"
            issues += "\n".join(sorted(batch_extreme_warnings[bname]))
            issues += "\n\n"
        elif batch_warnings[bname]:
            issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
            issues += "\n".join(sorted(batch_warnings[bname]))
            issues += "\n\n"
        text = ""
        text += "Dear all,\n\n"
        text += "A batch of release validation workflows has finished.\n\n"
        text += "Batch ID:\n\n"
        text += "%s\n\n" % (bname)
        text += "Detail of the workflows\n\n"
        text += "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s\n\n" % (
            bname)
        text += "%s\n\n" % (issues)
        text += "This is an automated message.\n\n"
        text += ""
        recipients = ['*****@*****.**']
        sendEmail(subject, text, destination=recipients)
        ## just announced ; take it out now.
        BI.pop(bname)
        deleteCampaignConfig(bname)

    if os.path.isfile('.closor_stop'):
        print("The loop on workflows was shortened")
        sendEmail('closor',
                  'Closor loop was shortened artificially using .closor_stop')
        try:
            os.remove('.closor_stop')
        except OSError:
            # best effort, same tolerance as the former `rm -f`
            pass
def completor(url, specific):
    """Force-complete or re-prioritise workflows that have been running long.

    Looks at workflows in status ``away`` or ``assistance*``.  For each one:

    * honours explicit force-complete requests (from ``wtcInfo().getForce()``
      and, when McM is up, from McM's ``forcecomplete`` list);
    * may open/ping a JIRA ticket when priority and running time exceed the
      thresholds in ``jira_priority_and_delays``;
    * may raise the priority depending on the time since injection;
    * force-completes it when its completion fractions pass the
      campaign-configured thresholds, limited by ``max_force_complete``.

    A summary of what is still running is written to ``longlasting.json``
    under ``monitor_dir`` and printed.

    Args:
        url: request manager url, passed to ``workflowInfo``,
            ``forceComplete`` and ``reqMgrClient``.
        specific: optional substring; when set only matching workflow names
            are considered and the per-round cap is not applied.

    Returns:
        None.
    """
    mlock = moduleLock(silent=True)
    if mlock():
        return

    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check():
        return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    # when True, candidates are only reported by e-mail instead of completed
    safe_mode = False

    CI = campaignInfo()
    # NOTE(review): SI is never read below; the call is kept in case the
    # constructor has side effects -- confirm and drop.
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira', False) else None

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all())
    wfs.extend(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())

    ## just take it in random order so that not always the same is seen
    random.shuffle(wfs)

    max_per_round = UC.get('max_per_round').get('completor', None)
    if max_per_round and not specific:
        wfs = wfs[:max_per_round]

    all_stuck = set()
    ## take into account what stagor was saying
    for _ in range(5):
        try:
            all_stuck.update(
                json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))
            break
        except Exception:
            # transient read/parse problem: wait and retry, but let
            # KeyboardInterrupt/SystemExit through
            time.sleep(2)

    for _ in range(5):
        try:
            ## take into account the block that needed to be repositioned recently
            all_stuck.update([
                b.split('#')[0]
                for b in json.loads(eosRead('%s/missing_blocks.json' % monitor_dir))
            ])
            break
        except Exception:
            time.sleep(2)

    ## take into account all stuck block and dataset from transfer team
    all_stuck.update(getAllStuckDataset())

    # per-campaign thresholds read from the campaign configuration
    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {}
    timeout = {}
    for c in CI.campaigns:
        campaign_conf = CI.campaigns[c]
        if 'force-complete' in campaign_conf:
            good_fractions[c] = campaign_conf['force-complete']
        if 'truncate-complete' in campaign_conf:
            truncate_fractions[c] = campaign_conf['truncate-complete']
        if 'force-timeout' in campaign_conf:
            timeout[c] = campaign_conf['force-timeout']
        if 'overdoing-complete' in campaign_conf:
            overdoing_fractions[c] = campaign_conf['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print("can force complete on")
    print(json.dumps(good_fractions, indent=2))
    print("can truncate complete on")
    print(json.dumps(truncate_fractions, indent=2))
    print("can overide on")
    print(json.dumps(overrides, indent=2))
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time (in days) above which to fire a JIRA
    jira_priority_and_delays = {
        110000: 21,
        90000: 28,
    }

    seconds_per_day = 60. * 60. * 24.

    for wfo in wfs:
        if specific and specific not in wfo.name:
            continue

        print("looking at %s" % wfo.name)

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip = False
        campaigns = wfi.getCampaigns()

        for user, spec in overrides.items():
            if not spec:
                continue
            spec = filter(None, spec)
            if wfi.request['RequestStatus'] not in ['force-complete', 'completed']:
                if (any(s in wfo.name for s in spec) or (wfo.name in spec)
                        or any(pid in spec for pid in pids)
                        or any(s in pids for s in spec)):
                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url, wfi)
                    skip = True
                    wfi.notifyRequestor(
                        "The workflow %s was force completed by request of %s" %
                        (wfo.name, user),
                        do_batch=False)
                    wfi.sendLog(
                        'completor',
                        '%s is asking for %s to be force complete' % (user, wfo.name))
                    break

        if wfo.status.startswith('assistance'):
            skip = True

        if skip:
            continue

        priority = wfi.request['RequestPriority']

        if 'Campaign' not in wfi.request:
            continue

        if wfi.request['RequestStatus'] not in [
                'acquired', 'running-open', 'running-closed'
        ]:
            continue

        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask()  ## can use that one, and follow mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        for task, outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask(task)
            for out in outs:
                # 1000. is an unreachable fraction: no threshold configured
                good_fraction_per_out[out] = good_fractions.get(task_campaign, 1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(
                    task_campaign, default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(
                    task_campaign, 1000.)

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.)

        priority_log = [
            change for change in wfi.request.get('PriorityTransition', [])
            if change['Priority'] == priority
        ]
        if not priority_log:
            print("\tHas no priority log")
            priority_delay = 0
        else:
            then = max([change['UpdateTime'] for change in priority_log]) / seconds_per_day
            priority_delay = now - then  ## in days
            print("priority was set to %s %s [days] ago" % (priority, priority_delay))

        running_log = [
            change for change in wfi.request['RequestTransition']
            if change["Status"] in ["running-open", "running-closed"]
        ]
        if not running_log:
            print("\tHas no running log")
            delay = 0
        else:
            then = max([change['UpdateTime'] for change in running_log]) / seconds_per_day
            delay = now - then  ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog(
            'completor',
            "Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"
            % (cpuh, delay, priority, priority_delay))
        if priority_delay != 0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url, original.request['OriginalRequestName'])
        injected_log = [
            change for change in original.request['RequestTransition']
            if change["Status"] in ["assignment-approved"]
        ]
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / seconds_per_day
            injection_delay = now - injected_on

        delay_for_priority_increase = injection_delay

        # one tab of indentation per full week of running time
        weeks, _ = divmod(delay, 7)
        print("%sRunning since %s [days] priority= %s" %
              ("\t" * int(weeks), delay, priority))

        pop_a_jira = False
        ping_on_jira = 7 * (24 * 60 * 60)  # 7 days
        for jp, jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd:
                pop_a_jira = True

        if pop_a_jira and JC:
            j, _, just_created = JC.create_or_last(
                prepid=wfi.request['PrepID'],
                priority=wfi.request['RequestPriority'],
                label='Late',
                reopen=True)
            last_time = JC.last_time(j)
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d" % (delay, priority)
                JC.comment(j.key, j_comment)

        if (delay_for_priority_increase is not None
                and delay_for_priority_increase > injection_delay_threshold
                and priority >= injection_delay_priority):
            quantized = 5000  ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority'] + int(
                (delay_priority_increase *
                 (delay_for_priority_increase - injection_delay_threshold) / 7) /
                quantized) * quantized
            tail_cutting_priority += 101  ## to signal it is from this mechanism
            tail_cutting_priority = min(400000, tail_cutting_priority)  ## never go above 400k priority
            tail_cutting_priority = max(tail_cutting_priority, priority)  ## never go below the current value

            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, increasing to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'bumping priority to %d for being injected since %s' %
                        (tail_cutting_priority, delay_for_priority_increase))
                    reqMgrClient.changePriorityWorkflow(url, wfo.name,
                                                        tail_cutting_priority)
                    max_priority -= 1
                else:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, would like to increase to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'would like to bump priority to %d for being injected since %s'
                        % (tail_cutting_priority, delay_for_priority_increase))
                    print("Could be changing the priority to higher value, but too many already were done")

        _, prim, _, _ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck:
            wfi.sendLog('completor', '%s is stuck' % ','.join(is_stuck))

        monitor_delay = 7
        # fall back to the 14-day default when the workflow lists no campaign,
        # instead of letting max() fail on an empty sequence
        allowed_delay = max([timeout.get(c, 14) for c in campaigns] or [14])

        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early, just for the sake of not computing the completion fraction just now.
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay:
            print("not enough time has passed yet")
            continue

        long_lasting[wfo.name] = {
            "delay": delay,
            "injection_delay": injection_delay
        }

        percent_completions = wfi.getCompletionFraction(caller='completor')

        if not percent_completions:
            sendLog('completor',
                    '%s has no output at all' % wfo.name,
                    level='critical')
            continue

        is_over_allowed_delay = (all(
            percent_completions[out] >= good_fraction_per_out.get(out, 1000.)
            for out in percent_completions) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and all(
            percent_completions[out] >= truncate_fraction_per_out.get(out, 1000.)
            for out in percent_completions) and delay >= allowed_delay)
        is_over_completion = all(
            percent_completions[out] >= good_fraction_nodelay_per_out.get(out, 1000.)
            for out in percent_completions)

        if is_over_completion:
            wfi.sendLog(
                'completor', "all is over completed %s\n %s" %
                (json.dumps(good_fraction_nodelay_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_allowed_delay:
            wfi.sendLog(
                'completor', "all is above %s \n%s" %
                (json.dumps(good_fraction_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_truncation_delay:
            wfi.sendLog(
                'completor',
                "all is above %s truncation level, and the input is stuck\n%s" %
                (json.dumps(truncate_fraction_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        else:
            long_lasting[wfo.name].update({
                # float() keeps the mean exact even if the fractions are ints
                'completion':
                sum(percent_completions.values()) / float(len(percent_completions)),
                'completions':
                percent_completions
            })

            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog(
                'completor',
                "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"
                % (json.dumps(percent_completions, indent=2),
                   json.dumps(good_fraction_per_out, indent=2),
                   json.dumps(truncate_fraction_per_out, indent=2),
                   json.dumps(long_lasting[wfo.name]['agents'], indent=2)))
            continue

        wfi.sendLog('completor', "Required %s, time spend %s" % (cpuh, delay))

        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days
        if max_force > 0:
            print("going for force-complete of %s" % wfo.name)
            if not safe_mode:
                forceComplete(url, wfi)
                set_force_complete.add(wfo.name)
                wfi.sendLog('completor', 'going for force completing')
                wfi.notifyRequestor(
                    "The workflow %s was force completed for running too long" %
                    wfo.name)
                max_force -= 1
            else:
                sendEmail(
                    'completor',
                    'The workflow %s is ready for force complete, but completor is in safe mode'
                    % wfo.name)
        else:
            wfi.sendLog('completor',
                        "too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog(
            'completor', 'The followings were set force-complete \n%s' %
            ('\n'.join(set_force_complete)))

    text = "These have been running for long"

    eosFile('%s/longlasting.json' % monitor_dir,
            'w').write(json.dumps(long_lasting, indent=2)).close()

    # longest-running first
    for wf, info in sorted(long_lasting.items(),
                           key=lambda tp: tp[1]['delay'],
                           reverse=True):
        delay = info['delay']
        text += "\n %s : %s days" % (wf, delay)
        if 'completion' in info:
            text += " %d%%" % (info['completion'] * 100)

    print(text)
def actor(url, options=None): mlock = moduleLock(wait=False, silent=True) if mlock(): return if userLock('actor'): return up = componentInfo(soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() WC = wtcClient() WI = wtcInfo() JC = JIRAClient() action_list = WC.get_actions() if action_list is None: print "Not able to load action list" sendLog('actor', 'Not able to load action list', level='critical') return if options.actions: action_list = json.loads(open(options.actions).read()) print json.dumps(action_list, indent=2) if not action_list: print "EMPTY!" return wf_list = action_list.keys() print json.dumps(sorted(wf_list), indent=2) if options.spec: wf_list = [wf for wf in wf_list if options.spec in wf] max_per_round = UC.get('max_per_round').get('actor', None) if max_per_round: random.shuffle(wf_list) wf_list = wf_list[:max_per_round] for wfname in wf_list: print '-' * 100 print "Looking at", wfname, "for recovery options" to_clone = False to_acdc = False to_force = False to_hold = False something_to_do = False tasks = action_list[wfname].get('Parameters', None) to_acdc = action_list[wfname].get('Action', None) == 'acdc' to_clone = action_list[wfname].get('Action', None) == 'clone' to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['by-pass', 'bypass'] to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['onhold', 'on-hold'] if not to_acdc and not to_clone and not to_force and not to_hold: sendLog( 'actor', 'Action submitted for something other than acdc, clone, bypass or hold for workflow %s' % wfname, level='critical') print json.dumps(action_list[wfname], indent=2) continue if not tasks and to_acdc: sendLog('actor', 'Empty action submitted for workflow %s' % wfname, level='critical') print "Moving on. 
Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor', 'Going to clone %s' % wfname) comment = "" if 'comment' in tasks: comment = ", reason: " + tasks['comment'] wfi.sendLog( 'actor', "invalidating the workflow by traffic controller %s" % comment) #Reject all workflows in the family inv_results = invalidate(url, wfi, only_resub=False, with_output=True) all_good = all(inv_results) if all_good: wfi.sendLog('actor', "%s and children are rejected" % wfname) else: wfi.sendLog('actor', "Failed to reject the request and dependents") sendLog('actor', 'Failed to reject the familly of %s' % wfname, level='critical') continue cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except Exception as e: sendLog( 'actor', 'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.' % wfname, level='critical') wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) print str(e) ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again if not cloned: recover = False wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) sendLog('actor', 'Failed to create clone for %s!' 
% wfname, level='critical') else: wfi.sendLog('actor', "Workflow %s cloned into %s" % (wfname, cloned)) ## set to trouble for swift replacement for wfo in session.query(Workflow).filter( Workflow.name == wfname).all(): wfo.status = 'trouble' session.commit() #=========================================================== elif to_force: wfi.sendLog( 'actor', 'Force-completing from workflow traffic controler request') WI.add(action='force', keyword=wfname, user=action_list[wfname].get('user', 'unified')) elif to_hold: wfi.sendLog('actor', 'Holding on workflow traffic controler request') WI.add(action='hold', keyword=wfname, user=action_list[wfname].get('user', 'unified')) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append( {setting: allTasksDefaults[setting]}) print "Tasks is " print json.dumps(tasks, indent=2) all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog( 'actor', 'Cannot create ACDCS for %s because WMErr cannot be reached.' % wfname, level='critical') continue if not WMErr: wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname) print "FYI getWMErrors is blank. Presumably there are only unreported errors" # continue try: where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo( ) print "Where to run = " print where_to_run if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') continue except: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') print "Moving on. 
Cannot access recovery info for " + wfname continue if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because site list cannot be found.' % wfname, level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 if WMErr: for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for", wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog( 'actor', 'Cannot create ACDCS for %s because it is a pLHE workflow.' % wfname, level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task print "Full task name is " + fulltaskname print where_to_run.keys() wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in [ 'Processing', 'Production', 'Merge' ]: wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks" % (fulltaskname, task_info.taskType)) if not fulltaskname in where_to_run.keys(): wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because there is no acdc doc for it anyways." 
% (fulltaskname)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites = [SI.SE_to_CE(actions[action])] else: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in actions[action] ])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in where_to_run[fulltaskname] ])) print "Found", sorted( assign_to_sites ), "as sites where to run the ACDC at, from the acdc doc of ", wfname print "Going to run at", sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do=options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog( 'actor', 'ACDC created for task %s. 
Actions taken \n%s' % (fulltaskname, json.dumps(actions))) jira_comment = "%s created ACDC for task %s with action %s" % ( action_list[wfname].get('user', 'unified'), task.split('/')[-1], json.dumps(actions), ) reason = action_list[wfname].get('Reason', None) if reason: jira_comment += '\ndue to: %s' % (reason) #team = wfi.request['Teams'][0] team = 'production' parameters = { 'SiteWhitelist': sorted(assign_to_sites), 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request[ 'RequestType'] == 'TaskChain' and 'Merge' in task.split( '/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists'] == 'true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'secondary' in actions: if actions['secondary'] == 'enabled': print 'Enabling reading the secondary input via xrootd' parameters['TrustPUSitelists'] = True elif actions['secondary'] == 'disabled': parameters['TrustPUSitelists'] = False #in case secondary is blank or not set to enabled or disabled elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC", acdc parameters['execute'] = True #wfi.sendLog('actor',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC", acdc sendLog('actor', "%s needs to be 
assigned" % (acdc), level='critical') wfi.sendLog( 'actor', "%s needs to be assigned by hand" % (acdc)) continue # print parameters result = reqMgrClient.assignWorkflow( url, acdc, team, parameters) if not result: print acdc, "was not assigned" sendLog('actor', "%s failed to be assigned" % (acdc), level='critical') wfi.sendLog( 'actor', "%s failed to get assigned for recovery" % acdc) else: wfi.sendLog('actor', "%s was assigned for recovery" % acdc) recovering.add(acdc) #wfi.sendLog('actor',"ACDCs created for %s"%wfname) try: if jira_comment: jiras = JC.find( {'prepid': wfi.request['PrepID']}) if len(jiras) == 1: ## put a comment on the single corresponding ticket JC.comment(jiras[0].key, jira_comment) JC.progress(jiras[0].key) except Exception as e: print "failed with JIRA" print str(e) #=========================================================== if recover and options.do: r = WC.remove_action(wfname) if not r: sendLog( 'actor', 'not able to remove the action, interlocking the module', level='critical') os.system('touch %s/actor.failed-%s.lock' % (base_eos_dir, os.getpid())) sys.exit(-1) ## update the status with recovering removing manual for wfo in session.query(Workflow).filter( Workflow.name == wfname).all(): wfo.status = wfo.status.replace('manual', 'recovering') session.commit() if message_to_user: print wfname, "to be notified to user(DUMMY)", message_to_user if message_to_ops: print 'message' #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return
def actor(url,options=None): mlock = moduleLock(wait=False ,silent=True) if mlock(): return if userLock('actor'): return up = componentInfo(soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() WC = wtcClient() WI = wtcInfo() JC = JIRAClient() action_list = WC.get_actions() if action_list is None: print "Not able to load action list" sendLog('actor','Not able to load action list', level='critical') return if options.actions: action_list = json.loads(open(options.actions).read()) print json.dumps( action_list, indent=2) if not action_list: print "EMPTY!" return wf_list = action_list.keys() print json.dumps( sorted( wf_list), indent=2) if options.spec: wf_list = [wf for wf in wf_list if options.spec in wf] max_per_round = UC.get('max_per_round').get('actor', None) if max_per_round: random.shuffle( wf_list ) wf_list = wf_list[:max_per_round] for wfname in wf_list: print '-'*100 print "Looking at",wfname,"for recovery options" to_clone = False to_acdc = False to_force = False to_hold = False something_to_do = False tasks = action_list[wfname].get( 'Parameters' , None) to_acdc = action_list[wfname].get( 'Action', None) == 'acdc' to_clone = action_list[wfname].get( 'Action', None) == 'clone' to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters' ,{}).get('action',None) in ['by-pass', 'bypass'] to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters' ,{}).get('action',None) in ['onhold','on-hold'] if not to_acdc and not to_clone and not to_force and not to_hold: sendLog('actor','Action submitted for something other than acdc, clone, bypass or hold for workflow %s'%wfname,level='critical') print json.dumps( action_list[wfname] , indent=2) continue if not tasks and to_acdc: sendLog('actor','Empty action submitted for workflow %s'%wfname,level='critical') print "Moving on. 
Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor','Going to clone %s'%wfname) comment="" if 'comment' in tasks: comment = ", reason: "+ tasks['comment'] wfi.sendLog('actor',"invalidating the workflow by traffic controller %s"%comment) #Reject all workflows in the family inv_results = invalidate(url, wfi, only_resub=False, with_output=True) all_good = all(inv_results) if all_good: wfi.sendLog('actor',"%s and children are rejected"%wfname) else: wfi.sendLog('actor',"Failed to reject the request and dependents") sendLog('actor','Failed to reject the familly of %s'% wfname, level='critical') continue cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except Exception as e: sendLog('actor','Failed to create clone for %s! Check logs for more information. 
Action will need to be resubmitted.'%wfname,level='critical') wfi.sendLog('actor','Failed to create clone for %s!'%wfname) print str(e) ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again if not cloned: recover = False wfi.sendLog('actor','Failed to create clone for %s!'%wfname) sendLog('actor','Failed to create clone for %s!'%wfname,level='critical') else: wfi.sendLog('actor',"Workflow %s cloned into %s"%(wfname, cloned)) ## set to trouble for swift replacement for wfo in session.query(Workflow).filter(Workflow.name == wfname).all(): wfo.status = 'trouble' session.commit() #=========================================================== elif to_force: wfi.sendLog('actor','Force-completing from workflow traffic controler request') WI.add(action='force', keyword = wfname, user = action_list[wfname].get( 'user', 'unified')) elif to_hold: wfi.sendLog('actor','Holding on workflow traffic controler request') WI.add(action='hold', keyword = wfname, user = action_list[wfname].get( 'user', 'unified')) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append({setting:allTasksDefaults[setting]}) print "Tasks is " print json.dumps(tasks, indent=2) all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog('actor','Cannot create ACDCS for %s because WMErr cannot be reached.'%wfname,level='critical') continue if not WMErr: wfi.sendLog('actor','WMErrors is blank for %s.'%wfname) print "FYI getWMErrors is blank. 
Presumably there are only unreported errors" # continue try: where_to_run, missing_to_run,missing_to_run_at = wfi.getRecoveryInfo() print "Where to run = " print where_to_run if not where_to_run: sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical') continue except: sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical') print "Moving on. Cannot access recovery info for " + wfname continue if not where_to_run: sendLog('actor','Cannot create ACDCS for %s because site list cannot be found.'%wfname,level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 if WMErr: for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for",wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog('actor','Cannot create ACDCS for %s because it is a pLHE workflow.'%wfname,level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. 
ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task print "Full task name is " + fulltaskname print where_to_run.keys() wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in ['Processing','Production','Merge']: wrong_task= True wfi.sendLog('actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"%( fulltaskname, task_info.taskType)) if not fulltaskname in where_to_run.keys(): wrong_task= True wfi.sendLog('actor', "Skipping task %s because there is no acdc doc for it anyways."%(fulltaskname)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites=[SI.SE_to_CE(actions[action])] else: assign_to_sites=list(set([SI.SE_to_CE(site) for site in actions[action]])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[fulltaskname]])) print "Found",sorted(assign_to_sites),"as sites where to run the ACDC at, from the acdc doc of ",wfname print "Going to run at",sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do = options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." 
sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog('actor','ACDC created for task %s. Actions taken \n%s'%(fulltaskname,json.dumps(actions))) jira_comment = "%s created ACDC for task %s with action %s"%( action_list[wfname].get( 'user', 'unified'), task.split('/')[-1] , json.dumps(actions), ) reason = action_list[wfname].get( 'Reason', None) if reason: jira_comment += '\ndue to: %s'%(reason) #team = wfi.request['Teams'][0] team = 'production' parameters={ 'SiteWhitelist' : sorted(assign_to_sites), 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists']=='true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'secondary' in actions: if actions['secondary'] == 'enabled': print 'Enabling reading the secondary input via xrootd' parameters['TrustPUSitelists'] = True elif actions['secondary'] == 'disabled': 
parameters['TrustPUSitelists'] = False #in case secondary is blank or not set to enabled or disabled elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']: parameters['TrustPUSitelists'] = True elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC",acdc parameters['execute']=True #wfi.sendLog('actor',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC",acdc sendLog('actor',"%s needs to be assigned"%(acdc), level='critical') wfi.sendLog('actor',"%s needs to be assigned by hand"%(acdc)) continue # print parameters result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) if not result: print acdc,"was not assigned" sendLog('actor',"%s failed to be assigned"%(acdc), level='critical') wfi.sendLog('actor',"%s failed to get assigned for recovery"% acdc) else: wfi.sendLog('actor',"%s was assigned for recovery"% acdc) recovering.add( acdc ) #wfi.sendLog('actor',"ACDCs created for %s"%wfname) try: if jira_comment: jiras = JC.find({'prepid' : wfi.request['PrepID']}) if len(jiras)==1: ## put a comment on the single corresponding ticket JC.comment(jiras[0].key, jira_comment) JC.progress(jiras[0].key) except Exception as e: print "failed with JIRA" print str(e) #=========================================================== if recover and options.do: r = WC.remove_action(wfname) if not r: sendLog('actor','not able to remove the action, interlocking the module', level='critical') os.system('touch %s/actor.failed-%s.lock'%( base_eos_dir, os.getpid() )) sys.exit(-1) ## update the status with recovering removing manual for wfo in session.query(Workflow).filter(Workflow.name == wfname).all(): wfo.status = wfo.status.replace('manual','recovering') session.commit() if message_to_user: print wfname,"to be notified to user(DUMMY)",message_to_user if message_to_ops: print 'message' #sendEmail( "notification 
in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return