def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return #if notRunningBefore( 'stagor' ): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos=[] if specific: wfos = session.query(Workflow).filter(Workflow.name==specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter(Workflow.status=='considered').all() wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n",wfo.name,"\n\tto be assigned" wfh = workflowInfo( url, wfo.name) ## check if by configuration we gave it a GO if not CI.go( wfh.request['Campaign'] ) and not options.go: print "No go for",wfh.request['Campaign'] n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: print wfo.name,wfh.request['RequestStatus'],"setting away and skipping" ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: print "cannot decide on version number" n_stalled+=1 continue #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() print "Site white list",sorted(sites_allowed) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) c_sites_allowed = CI.get(wfh.request['Campaign'], 'SiteWhitelist' , []) if c_sites_allowed: print "Would like to use the new whitelist, but will not until things went through a bit" sendEmail("using a restricted site white list","for %s"%(c_sites_allowed)) sites_allowed = list(set(sites_allowed) & set(c_sites_allowed)) c_black_list = CI.get(wfh.request['Campaign'], 'SiteBlacklist', []) if c_black_list: print "Reducing the whitelist due to black list in campaign configuration" print "Removing",c_black_list sites_allowed = list(set(sites_allowed) - set(c_black_list)) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] ncores = wfh.request.get('Multicore',1) memory_allowed = SI.sitesByMemory( wfh.request['Memory'] , maxCore=ncores) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"MB and",ncores,"core are",memory_allowed sites_allowed = list(set(sites_allowed) & set(memory_allowed)) print "Allowed",sorted(sites_allowed) secondary_locations=None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] print "From secondary requirement, now Allowed",sorted(sites_allowed) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] print "Holding the data but not allowed",list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] print "We could be running at",sorted(opportunistic_sites),"in addition" if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): print "One of the destination site is in downtime" down_time = True ## should this be send back to considered ? """ if available_fractions and not all([available>=1. for available in available_fractions.values()]): print "The input dataset is not located in full over sites" print json.dumps(available_fractions) if not options.test and not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## skip skip skip """ ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values() if down_time and not options.go: wfo.status = 'considered' session.commit() print "sending back to considered because of site downtime, instead of waiting" sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not sufficiently available. Probably phedex information lagging behind. \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## default back to white list to original white list with any data print "Allowed",sites_allowed if options.primary_aaa: sites_allowed = initial_sites_allowed options.useSiteListAsLocation = True else: sites_allowed = sites_with_any_data print "Selected for any data",sites_allowed if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): print wfo.name,"cannot be assign with no matched sites" sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name)) n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] print "Placing the output on", sites_out parameters={ 'SiteWhitelist' : sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } ## plain assignment here team='production' if options and options.team: team = options.team #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): ## consider SDSC parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] parameters['useSiteListAsLocation'] = True team = 'allocation-based' sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) if wfh.request['Campaign']=='RunIIWinter15GS' and random.random() < -1.0: parameters['SiteWhitelist'] = ['T3_US_SDSC'] team = 'allocation-based' sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'EventBased' in split_check.values(): print "Falling back to event splitting." sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) elif 'EventsPerJob' in split_check.values(): print "Modifying the number of job per event" sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: print "There is no go for assigning that request without event splitting" sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) print "need to go down to",eventsPerJob,"events per job" parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: print "need to go down to",lumisPerJob,"in assignment" sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: print "the regular splitting should work for",pstring sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock( secure ) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" print "Assigned",n_assigned print "Stalled",n_stalled
def assignor(url, specific=None, talk=True, options=None): if userLock() and not options.manual: return mlock = moduleLock() if mlock() and not options.manual: return if not componentInfo().check() and not options.manual: return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() SI = global_SI() ###NLI = newLockInfo() ###if not NLI.free() and not options.go: return LI = lockInfo() #if not LI.free() and not options.go and not options.manual: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass aaa_mapping = json.loads(eosRead('%s/equalizor.json' % monitor_pub_dir))['mapping'] all_stuck = set() all_stuck.update( json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') # Temporarily switch off prioritization random.shuffle(wfos) ##order by priority instead of random """ if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank( wfn ): return cache.index( wfn ) if wfn in cache else 0 wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True) print "10 first",[wfo.name for wfo in wfos[:10]] print "10 last",[wfo.name for wfo in wfos[-10:]] else: random.shuffle( wfos ) """ for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue if not options.manual and 'rucio' in (wfo.name).lower(): continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" wfh.sendLog('assignor', "%s to be assigned %s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed, sites_not_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('assignor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('assignor', critical_msg, level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) blocks = wfh.getBlocks() if blocks: wfh.sendLog( 'assignor', "Needs {} blocks in input {}".format(len(blocks), '\n'.join(blocks))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters and primary: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] wfh.sendLog( 'assignor', "Initial values for primary_AAA=%s and secondary_AAA=%s" % (primary_aaa, secondary_aaa)) if primary_aaa: if "T2_CH_CERN_HLT" in sites_allowed: sites_allowed.remove("T2_CH_CERN_HLT") if "T2_CH_CERN_HLT" not in sites_not_allowed: sites_not_allowed.append("T2_CH_CERN_HLT") ## keep track of this, after secondary input location restriction : that's how you want to operate it initial_sites_allowed = copy.deepcopy(sites_allowed) set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) ## if they are requested for processing, they should bbe all closed already # FIXME: remove this closeAllBlocks #closeAllBlocks(url, prim, blocks) ## should be 2 but for the time-being let's lower it to get things going _copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) # TODO Alan on 1/april/2020: keep the AAA functionality if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_allowed: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_allowed) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if isStoreResults: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1t2_only = [ ce for ce in sites_allowed if [ce.startswith('T1') or ce.startswith('T2')] ] if t1t2_only: # try to pick from T1T2 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])] # then pick any otherwise else: sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] print "available=", SI.disk[sites_out[0]] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'SiteBlacklist': sites_not_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: # Do not set TrustPUSitelist to True if there is no secondary if secondary: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] # FIXME: decide which of the lines below needs to remain... eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) if wfh.producePremix() and (not wfh.isRelval()): title = "Heavy workflow assigned to {}".format( parameters['SiteWhitelist']) body = "Workflow name: {}".format( wfh.request['RequestName']) body += "\nOutput dataset(s): {}".format( wfh.request['OutputDatasets']) body += "\nAssigned to: {}".format( parameters['SiteWhitelist']) sendEmail( title, body, destination=[ '*****@*****.**' ]) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos=[] fetch_from = [] if specific or options.early: fetch_from.extend(['considered','staging']) if specific: fetch_from.extend(['considered-tried']) fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from",fetch_from for status in fetch_from: wfos.extend(session.query(Workflow).filter(Workflow.status==status).all()) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )) all_stuck.update( getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) if options.priority and int(wfh.request['RequestPriority']) < options.priority: continue options_text="" if options.early: options_text+=", early option is ON" if options.partial: options_text+=", partial option is ON" options_text+=", good fraction is %.2f"%options.good_enough wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = False for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update( CI.campaigns[campaign] ) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]: banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go=True wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier))) sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys())))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]: assign_parameters.update( allowed_secondary[sec] ) if no_go: n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns: assign_parameters.update( CI.campaigns[wfh.request['Campaign']] ) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off") primary_aaa=False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update( aaa_mapping.get(site,[]) ) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed)) if not primary_aaa: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints",sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled+=1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low ))) copies_wanted = max(1., copies_wanted-1.) if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) above_good = all([available >= do_partial for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') n_stalled+=1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good): wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled+=1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor',"cannot be assign with no matched sites") sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed)) if 'parameters' in assign_parameters: parameters.update( assign_parameters['parameters'] ) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters #parameters.update( CI.parameters(wfh.request['Campaign']) ) parameters.update( assign_parameters.get('parameters',{}) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'NoGo' in split_check.values(): wfh.sendLog('assignor', "Failing splitting check") sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical') n_stalled+=1 continue if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical') ## we have a problem here, that EventBased should never be used as a backup if not options.go: n_stalled+=1 continue continue ## skip all together elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per job") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') elif 'EventsPerLumi' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this") # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat #NLI.lock( secure ) LI.lock( secure, reason = 'assigning') #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign. Please check the logs") print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read()) aaa_mapping = json.loads( open('%s/equalizor.json' % monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read())) all_stuck.update(getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') ##order by priority instead of random if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key=lambda r: r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank(wfn): return cache.index(wfn) if wfn in cache else 0 wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True) print "10 first", [wfo.name for wfo in wfos[:10]] print "10 last", [wfo.name for wfo in wfos[-10:]] else: random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" if options.partial: options_text += ", partial option is ON" options_text += ", good fraction is %.2f" % options.good_enough wfh.sendLog('assignor', "%s to be assigned%s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys())))) sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = wfh.getBlockWhiteList() rwl = wfh.getRunWhiteList() if rwl: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=rwl))) lwl = wfh.getLumiWhiteList() if lwl: ## augment with lumi white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=lwl))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog( 'assignor', "Overiding partial copy assignment to %.2f fraction" % do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] if secondary_aaa: if not one_secondary_locations: sec_availability = getDatasetBlocksFraction(url, sec) if sec_availability >= 1. and options.go: ## there is at least one copy of each block on disk. We should go ahead and let it go. wfh.sendLog( 'assignor', "The secondary %s is available %s times on disk, and usable" % (sec, sec_availability)) else: ## not even a copy on disk anywhere !!!! sites_allowed = [] ## will block the assignment wfh.sendLog( 'assignor', "The secondary %s is nowhere on disk" % sec) #just continue without checking continue #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From/after secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) if primary_aaa: available_fractions[prim] = getDatasetBlocksFraction( url, prim, only_blocks=blocks) sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] if primary_aaa: sites_all_data = list( set([ SI.SE_to_CE(psite) for (psite, (there, frac)) in presence.items() if there ])) sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] if primary_aaa: sites_with_any_data = list( set([SI.SE_to_CE(psite) for psite in presence.keys()])) wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite for osite in opportunistic_sites if osite in SI.sites_not_ready ])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if not isStoreResults: sites_allowed = sites_with_any_data else: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue available_fractions = {} wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints", sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled += 1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog( 'assignor', "The workflow can run at %s under low pressure currently" % (','.join(allowed_and_low))) copies_wanted = max(1., copies_wanted - 1.) if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) above_good = all([ available >= do_partial for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( 'assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay') n_stalled += 1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not ( do_partial and above_good): wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled += 1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) if isHEPCloudReady(url) and wfh.isGoodForNERSC(): parameters['Team'] = 'hepcloud' parameters['SiteWhitelist'] = ['T3_US_NERSC'] if primary: parameters['TrustSitelists'] = True if secondary: parameters['TrustPUSitelists'] = True sendEmail("sending work to hepcloud", "pleasse check on %s" % wfh.request['RequestName'], destination=['*****@*****.**']) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return #if notRunningBefore( 'stagor' ): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all()) if specific: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) #if specific: # #wfos = session.query(Workflow).filter(Workflow.name==specific).all() # wfos = session.query(Workflow).filter(Workflow.name.contains(specific)).all() #if not wfos: # if specific: # wfos = session.query(Workflow).filter(Workflow.status=='considered').all() # wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) # wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog('assignor', "%s to be assigned" % wfo.name) ## check if by configuration we gave it a GO if not CI.go(wfh.request['Campaign']) and not options.go: wfh.sendLog('assignor', "No go for %s" % wfh.request['Campaign']) n_stalled += 1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite in SI.sites_not_ready for osite in opportunistic_sites ])) down_time = True ## should this be send back to considered ? """ if available_fractions and not all([available>=1. for available in available_fractions.values()]): print "The input dataset is not located in full over sites" print json.dumps(available_fractions) if not options.test and not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## skip skip skip """ ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) sendEmail( "cannot be assigned due to downtime", "%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered." % wfo.name) continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early: wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status continue ## default back to white list to original white list with any data print "Allowed", sites_allowed if options.primary_aaa: sites_allowed = initial_sites_allowed #options.useSiteListAsLocation = True options.TrustSitelists = True else: sites_allowed = sites_with_any_data wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) if options.restrict: print "Allowed", sites_allowed sites_allowed = sites_with_any_data print "Selected", sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for", list( set(sites_allowed) - set(sites_with_data)), "?" print "Whitelist site with any data", list( set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): wfh.sendLog('assignor', "cannot be assign with no matched sites") sendEmail("cannot be assigned", "%s has no whitelist" % (wfo.name)) n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team ## high priority team agent #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): # ## consider SDSC # parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] # parameters['useSiteListAsLocation'] = True # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0: # parameters['SiteWhitelist'] = ['T3_US_SDSC'] # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") sendEmail( "Fallback to EventBased", "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name) elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") sendEmail( "Modifying the job per events", "the workflow %s is too heavy in number of jobs explosion" % wfo.name) # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: sendEmail( "issue with event splitting for run-dependent MC", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: sendEmail( "setting lumi splitting for run-dependent MC", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: sendEmail("leaving splitting untouched for PU_RD*", "please check on " + wfo.name) wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock(secure) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def checkor(url, spec=None, options=None): fDB = falseDB() wfs=[] if options.fetch: #workflows = getWorkflows(url, status='completed') #for wf in workflows: # wfo = session.query(Workflow).filter(Workflow.name == wf ).first() # if wfo: # if not wfo.status in ['away','assistance']: continue # wfs.append(wfo ) wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) else: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign for wfo in wfs: if spec and not (spec in wfo.name): continue ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out print wfo.name,"is already",wfo.wm_status wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived']: ## went into trouble wfo.status = 'trouble' print wfo.name,"is in trouble",wfo.wm_status session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet print wfo.name,"not running yet" session.commit() continue if wfo.wm_status != 'completed': ## for sure move on with closeout check if in completed print "no need to check on",wfo.name,"in status",wfo.wm_status session.commit() continue session.commit() sub_assistance="" # if that string is filled, there will be need for manual assistance is_closing = True ## do the closed-out checks one by one # tuck out DQMIO/DQM wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out] ## anything running on acdc familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] for member in familly: if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestType'] != 'Resubmission': continue if member['RequestStatus'] in ['running-opened','running-closed','assignment-approved','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) #print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False ## completion check percent_completions = {} event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] fractions_pass = {} for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) fractions_pass[output] = 0.95 c = get_campaign(output, wfi) if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] print "overriding fraction to",fractions_pass[output],"for",output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output if not all([percent_completions[out] > fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? sub_assistance+='-recovery' is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 300. campaign = get_campaign(output, wfi) if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if any([ events_per_lumi[out] > lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? sub_assistance+='-biglumi' is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] vetoed_custodial_tier = ['MINIAODSIM'] out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" break ## get from the parent if not custodial and 'InputDataset' in wfi.request: parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort" continue if not custodial: ## pick one at random custodial = SI.pick_SE() if custodial and not sub_assistance and not acdc: ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): custodials[custodial].append( output ) else: print "cannot find a custodial for",wfo.name is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: for output in wfi.request['OutputDatasets']: try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? sub_assistance+='-duplicates' is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = len(acdc) ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: reqMgrClient.closeOutWorkflowCascade(url, wfo.name) # set it from away/assistance* to close wfo.status = 'close' session.commit() else: print wfo.name,"needs assistance" ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it wfo.status = 'assistance'+sub_assistance if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() fDB.summary() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low') print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos=[] if specific or options.early: wfos.extend( session.query(Workflow).filter(Workflow.status=='considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) if specific: wfos.extend( session.query(Workflow).filter(Workflow.status=='considered-tried').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) wfh.sendLog('assignor',"%s to be assigned"%wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False allowed_secondary = set() for campaign in wfh.getCampaigns(): if not CI.go( campaign ): wfh.sendLog('assignor',"No go for %s"%campaign) if not options.go: n_stalled+=1 no_go = True break if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)), level='critical') if not options.go: n_stalled+=1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite in SI.sites_not_ready for osite in opportunistic_sites])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial: wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed",sorted(sites_allowed) if options.primary_aaa: sites_allowed = initial_sites_allowed #options.useSiteListAsLocation = True options.TrustSitelists = True else: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints",sorted(sites_allowed) #if options.partial: # continue if not len(sites_allowed): wfh.sendLog('assignor',"cannot be assign with no matched sites") #sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name)) sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team ## high priority team agent #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): # ## consider SDSC # parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] # parameters['useSiteListAsLocation'] = True # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0: # parameters['SiteWhitelist'] = ['T3_US_SDSC'] # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting'%wfo.name, level='critical') elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock( secure ) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all()) if specific: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read()) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog('assignor', "%s to be assigned" % wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: n_stalled += 1 no_go = True allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary)), level='critical') if not options.go: n_stalled += 1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'primary_AAA' in CI.campaigns[ wfh.request['Campaign']]: primary_aaa = primary_aaa or CI.campaigns[ wfh.request['Campaign']]['primary_AAA'] secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'secondary_AAA' in CI.campaigns[ wfh.request['Campaign']]: secondary_aaa = secondary_aaa or CI.campaigns[ wfh.request['Campaign']]['secondary_AAA'] for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite in SI.sites_not_ready for osite in opportunistic_sites ])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( 'assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay') continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial: wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed", sorted(sites_allowed) if primary_aaa: sites_allowed = initial_sites_allowed options.TrustSitelists = True wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) else: sites_allowed = sites_with_any_data wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) if secondary_aaa: options.TrustPUSitelists = True wfh.sendLog( 'assignor', "Reading secondary through xrootd from %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list( set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints", sorted(sites_allowed) if not len(sites_allowed): wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog( 'assignor', 'the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting' % wfo.name, level='critical') elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog( 'assignor', "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level='critical') # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock(secure) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staging").all()) if specific: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered-tried").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staged").all()) dataset_endpoints = json.loads(open("%s/dataset_endpoints.json" % monitor_dir).read()) max_per_round = UC.get("max_per_round").get("assignor", None) max_cpuh_block = UC.get("max_cpuh_block") random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(","))): continue # if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog("assignor", "%s to be assigned" % wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: n_stalled += 1 no_go = True allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and "secondaries" in CI.campaigns[campaign]: allowed_secondary.update(CI.campaigns[campaign]["secondaries"]) if (secondary and allowed_secondary) and (set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog("assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary))) # sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog( "assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)), level="critical", ) if not options.go: n_stalled += 1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request["RequestStatus"] != "assignment-approved": if not options.test: wfh.sendLog("assignor", "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request["RequestStatus"] wfo.status = "away" session.commit() continue else: print wfo.name, wfh.request["RequestStatus"] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog("assignor", "cannot decide on version number") n_stalled += 1 wfo.status = "trouble" session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog("assignor", "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request["Campaign"], "SecondaryLocation", []) blocks = [] if "BlockWhitelist" in wfh.request: blocks = wfh.request["BlockWhitelist"] if "RunWhitelist" in wfh.request and wfh.request["RunWhitelist"]: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=wfh.request["RunWhitelist"]))) wfh.sendLog("assignor", "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "primary_AAA" in CI.campaigns[wfh.request["Campaign"]] ): primary_aaa = primary_aaa or CI.campaigns[wfh.request["Campaign"]]["primary_AAA"] secondary_aaa = options.secondary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "secondary_AAA" in CI.campaigns[wfh.request["Campaign"]] ): secondary_aaa = secondary_aaa or CI.campaigns[wfh.request["Campaign"]]["secondary_AAA"] for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") # pass if secondary_aaa: # just continue without checking continue presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.0] # one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only # sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog("assignor", "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = "/store/mc" ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) # sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] # sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.0] ] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog( "assignor", "Holding the data but not allowed %s" % sorted( list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])) ), ) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO # opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( (set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]) ) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog("assignor", "We could be running in addition at %s" % sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( "assignor", "One of the usable site is in downtime %s" % ([osite in SI.sites_not_ready for osite in opportunistic_sites]), ) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog("assignor", "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: # sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( "assignor", "%s requires a large numbr of CPUh %s , not assigning, please check with requester" % (wfo.name, cpuh), level="critical", ) wfh.sendLog("assignor", "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "maxcopies" in CI.campaigns[wfh.request["Campaign"]] ): copies_needed_from_campaign = CI.campaigns[wfh.request["Campaign"]]["maxcopies"] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog("assignor", "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]): not_even_once = not all([available >= 1.0 for available in available_fractions.values()]) wfh.sendLog( "assignor", "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values()), ) if down_time and not options.go and not options.early: wfo.status = "considered" session.commit() wfh.sendLog("assignor", "sending back to considered because of site downtime, instead of waiting") # sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( "assignor", "%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered." % (wfo.name), level="delay", ) continue # pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open("cannot_assign.json").read()) except: pass if ( not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial ): wfh.sendLog( "assignor", "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) known.append(wfo.name) open("cannot_assign.json", "w").write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == "considered": wfh.sendLog("assignor", "setting considered-tried") wfo.status = "considered-tried" session.commit() else: print "tried but status is", wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed", sorted(sites_allowed) if primary_aaa: sites_allowed = initial_sites_allowed options.TrustSitelists = True wfh.sendLog("assignor", "Selected to read primary through xrootd %s" % sorted(sites_allowed)) else: sites_allowed = sites_with_any_data wfh.sendLog("assignor", "Selected for any data %s" % sorted(sites_allowed)) if secondary_aaa: options.TrustPUSitelists = True wfh.sendLog("assignor", "Reading secondary through xrootd from %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints", sorted(sites_allowed) if not len(sites_allowed): wfh.sendLog("assignor", "cannot be assign with no matched sites") sendLog("assignor", "%s has no whitelist" % wfo.name, level="critical") n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith("T1")] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog("assignor", "Placing the output on %s" % sites_out) parameters = { "SiteWhitelist": sites_allowed, "NonCustodialSites": sites_out, "AutoApproveSubscriptionSites": list(set(sites_out)), "AcquisitionEra": wfh.acquisitionEra(), "ProcessingString": wfh.processingString(), "MergedLFNBase": set_lfn, "ProcessingVersion": version, } ## plain assignment here team = "production" if os.getenv("UNIFIED_TEAM"): team = os.getenv("UNIFIED_TEAM") if options and options.team: team = options.team if False and "T2_CH_CERN" in parameters["SiteWhitelist"]: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters["SiteWhitelist"] = ["T2_CH_CERN_HLT"] team = "hlt" ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and "," in v: parameters[key] = filter(None, v.split(",")) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog("assignor", "Setting the number of events per job to 500k max") parameters["EventsPerJob"] = 500000 ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request["Campaign"])) if not options.test: parameters["execute"] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if "EventBased" in split_check.values(): wfh.sendLog("assignor", "Falling back to event splitting.") # sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name, level="critical", ) elif "EventsPerJob" in split_check.values(): wfh.sendLog("assignor", "Modifying the number of job per event") # sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level="critical" ) # Handle run-dependent MC pstring = wfh.processingString() if "PU_RD" in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if "PU_RD2" in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: # sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog( "assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level="critical", ) wfh.sendLog("assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters["EventsPerJob"] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl["events_per_job"] if "events_per_job" in spl else None eventsPerJobEstimated = spl["avg_events_per_job"] if "avg_events_per_job" in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: # sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog( "assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level="critical" ) wfh.sendLog("assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters["LumisPerJob"] = lumisPerJob else: # sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( "assignor", "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level="critical", ) wfh.sendLog("assignor", "leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = "away" session.commit() n_assigned += 1 wfh.sendLog("assignor", "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list(sec) + new_wfi.request["OutputDatasets"]: ## lock all outputs flat NLI.lock(secure) # for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog("assignor", "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') if forcings: sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name) bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' (GMT)' ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] holdings = [] #try: # already_notified = json.loads(open('already_notifified.json').read()) #except: # print "no record of already notified workflow. starting fresh" # already_notified = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: mcm_force = mcm.get('/restapi/requests/forcecomplete') bypasses.extend( mcm_force ) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False pids = wfi.getPrepIDs() bypass_by_mcm = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break if bypass in pids: wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass)) bypass_checks = True bypass_by_mcm = True break #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks: # print "No go for",wfo.name # wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign']) # continue tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = int(wfi.request['Task1']['RequestNumEvents']) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: print "These %d files are missing in phedex"%(len(missing_phedex)) print "\n".join( missing_phedex ) if missing_dbs: print "These %d files are missing in dbs"%(len(missing_dbs)) print "\n".join( missing_dbs ) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and bypass_by_mcm: ## shoot large on all prepids for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def checkor(url, spec=None, options=None): fDB = closeoutInfo() if userLock(): return if duplicateLock(): return UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=["mcm"]) if not up.check(): return use_mcm = up.status["mcm"] wfs = [] if options.fetch: ## get all in running and check wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all()) wfs.extend(session.query(Workflow).filter(Workflow.status == "assistance").all()) if options.nofetch: ## than get all in need for assistance wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all()) custodials = defaultdict(list) # sites : dataset list transfers = defaultdict(list) # sites : dataset list invalidations = [] # a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split("/")[2].split("-")[0] except: if "Campaign" in wfi.request: campaign = wfi.request["Campaign"] return campaign by_passes = [] holdings = [] for bypassor, email in [ ("jbadillo", "*****@*****.**"), ("vlimant", "*****@*****.**"), ("jen_a", "*****@*****.**"), ]: bypass_file = "/afs/cern.ch/user/%s/%s/public/ops/bypass.json" % (bypassor[0], bypassor) if not os.path.isfile(bypass_file): print "no file", bypass_file continue try: by_passes.extend(json.loads(open(bypass_file).read())) except: print "cannot get by-passes from", bypass_file, "for", bypassor sendEmail("malformated by-pass information", "%s is not json readable" % (bypass_file), destination=[email]) holding_file = "/afs/cern.ch/user/%s/%s/public/ops/onhold.json" % (bypassor[0], bypassor) if not os.path.isfile(holding_file): print "no file", holding_file continue try: holdings.extend(json.loads(open(holding_file).read())) except: print "cannot get holdings from", holding_file, "for", bypassor sendEmail( "malformated by-pass information", "%s is not json readable" % (holding_file), destination=[email] ) total_running_time = 5.0 * 60.0 sleep_time = max(0.5, total_running_time / len(wfs)) for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep(sleep_time) print "checking on", wfo.name ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request["RequestStatus"] if wfo.wm_status == "closed-out": ## manually closed-out print wfo.name, "is already", wfo.wm_status wfo.status = "close" session.commit() continue elif wfo.wm_status in [ "failed", "aborted", "aborted-archived", "rejected", "rejected-archived", "aborted-completed", ]: ## went into trouble wfo.status = "trouble" print wfo.name, "is in trouble", wfo.wm_status session.commit() continue elif wfo.wm_status in ["assigned", "acquired"]: ## not worth checking yet print wfo.name, "not running yet" session.commit() continue if "-onhold" in wfo.status: if wfo.name in holdings and wfo.name not in by_passes: print wfo.name, "on hold" continue if wfo.name in holdings and wfo.name not in by_passes: wfo.status = "assistance-onhold" print "setting", wfo.name, "on hold" session.commit() continue if wfo.wm_status != "completed" and not wfo.name in by_passes: ## for sure move on with closeout check if in completed print "no need to check on", wfo.name, "in status", wfo.wm_status session.commit() continue session.commit() sub_assistance = "" # if that string is filled, there will be need for manual assistance is_closing = True ## get it from somewhere by_pass_checks = False if wfo.name in by_passes: print "we can bypass checks on", wfo.name by_pass_checks = True for bypass in by_passes: if bypass in wfo.name: print "we can bypass", wfo.name, "because of keyword", bypass by_pass_checks = True break if not CI.go(wfi.request["Campaign"]) and not by_pass_checks: print "No go for", wfo.name continue # tuck out DQMIO/DQM wfi.request["OutputDatasets"] = [out for out in wfi.request["OutputDatasets"] if not "/DQM" in out] ## anything running on acdc familly = getWorkflowById(url, wfi.request["PrepID"], details=True) acdc = [] acdc_inactive = [] has_recovery_going = False had_any_recovery = False for member in familly: if member["RequestType"] != "Resubmission": continue if member["RequestName"] == wfo.name: continue if member["RequestDate"] < wfi.request["RequestDate"]: continue if member["RequestStatus"] in [ "running-open", "running-closed", "assignment-approved", "assigned", "acquired", ]: print wfo.name, "still has an ACDC running", member["RequestName"] acdc.append(member["RequestName"]) # print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False has_recovery_going = True elif member["RequestStatus"] == None: print member["RequestName"], "is not real" pass else: acdc_inactive.append(member["RequestName"]) had_any_recovery = True ## completion check percent_completions = {} # print "let's see who is crashing", wfo.name # print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if not "TotalInputEvents" in wfi.request: event_expected, lumi_expected = 0, 0 if not "recovery" in wfo.status: sendEmail( "missing member of the request", "TotalInputEvents is missing from the workload of %s" % wfo.name, destination=["*****@*****.**"], ) else: event_expected, lumi_expected = wfi.request["TotalInputEvents"], wfi.request["TotalInputLumis"] if "RequestNumEvents" in wfi.request: event_expected = int(wfi.request["RequestNumEvents"]) elif "Task1" in wfi.request and "RequestNumEvents" in wfi.request["Task1"]: event_expected = int(wfi.request["Task1"]["RequestNumEvents"]) fractions_pass = {} for output in wfi.request["OutputDatasets"]: event_count, lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0.0 if lumi_expected: percent_completions[output] = lumi_count / float(lumi_expected) if event_expected: percent_completions[output] = max(percent_completions[output], event_count / float(event_expected)) fractions_pass[output] = 0.95 c = get_campaign(output, wfi) if c in CI.campaigns and "fractionpass" in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]["fractionpass"] print "overriding fraction to", fractions_pass[output], "for", output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to", fractions_pass[output], "by command line for", output if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name, "is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if has_recovery_going: sub_assistance += "-recovering" elif had_any_recovery: ## we want to have this looked at sub_assistance += "-manual" else: sub_assistance += "-recovery" is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request["OutputDatasets"]: events_per_lumi[output] = getDatasetEventsPerLumi(output) lumi_upper_limit = {} for output in wfi.request["OutputDatasets"]: upper_limit = 301.0 campaign = get_campaign(output, wfi) # if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and "lumisize" in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]["lumisize"] print "overriding the upper lumi size to", upper_limit, "for", campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to", upper_limit, "by command line" lumi_upper_limit[output] = upper_limit if any([events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name, "has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? sub_assistance += "-biglumi" is_closing = False any_presence = {} for output in wfi.request["OutputDatasets"]: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request["OutputDatasets"]: custodial_presences[output] = [s for s in any_presence[output] if "MSS" in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence = {} for output in wfi.request["OutputDatasets"]: phedex_presence[output] = phedexClient.getFileCountDataset(url, output) vetoed_custodial_tier = UC.get("tiers_with_no_custodial") out_worth_checking = [ out for out in custodial_locations.keys() if out.split("/")[-1] not in vetoed_custodial_tier ] size_worth_checking = sum( [getDatasetSize(out) / 1023.0 for out in out_worth_checking] ) ## size in TBs of all outputs if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name, "has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:", custodial, "because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and "custodial" in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]["custodial"] print "Setting custodial to", custodial, "from campaign configuration" break if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the unified configuration custodial:", custodial, "because of limited space" custodial = None ## get from the parent pick_custodial = True if not custodial and "InputDataset" in wfi.request: ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite(wfi.request["InputDataset"]) ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset", wfi.request[ "InputDataset" ], "does not have custodial in the first place. abort" sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor" % wfi.request["InputDataset"], ) is_closing = False pick_custodial = False if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:", custodial, "because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for", wfo.name sendEmail( "cannot find a custodial", "cannot find a custodial for %s probably because of the total output size %d" % (wfo.name, size_worth_checking), ) if custodial and ((not sub_assistance and not acdc) or by_pass_checks): ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output] >= 1: custodials[custodial].append(output) else: print "no file in phedex for", output, " not good to add to custodial requests" is_closing = False ## disk copy disk_copies = {} for output in wfi.request["OutputDatasets"]: disk_copies[output] = [s for s in any_presence[output] if (not "MSS" in s) and (not "Buffer" in s)] if not all(map(lambda sites: len(sites) != 0, disk_copies.values())): print wfo.name, "has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request["OutputDatasets"]: dbs_presence[output] = dbs3Client.getFileCountDataset(output) dbs_invalid[output] = dbs3Client.getFileCountDataset(output, onlyInvalid=True) fraction_invalid = 0.01 if ( not all( [ dbs_presence[out] == (dbs_invalid[out] + phedex_presence[out]) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if ( not all( [ (dbs_invalid[out] <= int(fraction_invalid * dbs_presence[out])) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance += "-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: print "starting duplicate checker for", wfo.name for output in wfi.request["OutputDatasets"]: print "\tchecking", output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: print "was not possible to get the duplicate count for", output is_closing = False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name, "has duplicates" print json.dumps(duplications, indent=2) ## hook for making file invalidation ? sub_assistance += "-duplicates" is_closing = False ## for visualization later on if not wfo.name in fDB.record: # print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = {"datasets": {}, "name": wfo.name, "closeOutWorkflow": None} fDB.record[wfo.name]["closeOutWorkflow"] = is_closing for output in wfi.request["OutputDatasets"]: if not output in fDB.record[wfo.name]["datasets"]: fDB.record[wfo.name]["datasets"][output] = {} rec = fDB.record[wfo.name]["datasets"][output] rec["percentage"] = float("%.2f" % (percent_completions[output] * 100)) rec["duplicate"] = duplications[output] if output in duplications else "N/A" rec["phedexReqs"] = ( float("%.2f" % any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output]) != 0 else "N/A" ) rec["closeOutDataset"] = is_closing rec["transPerc"] = ( float("%.2f" % any_presence[output][disk_copies[output][0]][1]) if len(disk_copies[output]) != 0 else "N/A" ) rec["correctLumis"] = ( int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True ) rec["missingSubs"] = ( False if len(custodial_locations[output]) == 0 else ",".join(list(set(custodial_locations[output]))) ) rec["dbsFiles"] = dbs_presence[output] rec["dbsInvFiles"] = dbs_invalid[output] rec["phedexFiles"] = phedex_presence[output] rec["acdc"] = "%d / %d" % (len(acdc), len(acdc + acdc_inactive)) if by_pass_checks: ## force closing is_closing = True ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting", wfo.name, "closed-out" if not options.test: if wfo.wm_status in ["closed-out", "announced", "normal-archived"]: print wfo.name, "is already", wfo.wm_status, "not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer", res if not res in ["None", None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None, "None"]: wfo.status = "close" session.commit() else: print "could not close out", wfo.name, "will try again next time" else: ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it new_status = "assistance" + sub_assistance print wfo.name, "needs assistance with", new_status if sub_assistance and wfo.status != new_status and "PrepID" in wfi.request and not "manual" in wfo.status: pid = wfi.getPrepIDs()[0].replace("task_", "") # pid = wfi.request['PrepID'].replace('task_','') ## notify messages = { "recovery": "Samples completed with missing statistics:\n%s " % ( "\n".join( [ "%.2f %% complete for %s" % (percent_completions[output] * 100, output) for output in wfi.request["OutputDatasets"] ] ) ), "biglumi": "Samples completed with large luminosity blocks:\n%s " % ( "\n".join( [ "%d > %d for %s" % (events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request["OutputDatasets"] if (events_per_lumi[output] > lumi_upper_limit[output]) ] ) ), "duplicate": "Samples completed with duplicated luminosity blocks:\n%s" % ( "\n".join( [ "%s" % output for output in wfi.request["OutputDatasets"] if output in duplications and duplications[output] ] ) ), } text = "The request %s (%s) is facing issue in production.\n" % (pid, wfo.name) content = "" for case in messages: if case in new_status: content += "\n" + messages[case] + "\n" text += content text += "You are invited to check, while this is being taken care of by Ops.\n" text += "This is an automated message." if use_mcm and content: print "Sending notification back to requestor" print text batches = mcm.getA("batches", query="contains=%s&status=announced" % pid) if len(batches): ## go notify the batch bid = batches[-1]["prepid"] print "batch nofication to", bid mcm.put("/restapi/batches/notify", {"notes": text, "prepid": bid}) ## go notify the request print "request notification to", pid mcm.put("/restapi/requests/notify", {"message": text, "prepids": [pid]}) ## case where the workflow was in manual from recoveror if not "manual" in wfo.status or new_status != "assistance-recovery": wfo.status = new_status if not options.test: print "setting", wfo.name, "to", wfo.status session.commit() else: print "current status is", wfo.status, "not changing to anything" fDB.html() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ",".join(custodials[site]), "=>", site if not options.test: result = makeReplicaRequest( url, site, list(set(custodials[site])), "custodial copy at production close-out", custodial="y", priority="low", approve=(site in SI.sites_auto_approve), ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ",".join(transfers[site]), "=>", site if not options.test: result = None # result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def parse_one(url, wfn, options=None): def time_point(label="", sub_lap=False): now = time.mktime(time.gmtime()) nows = time.asctime(time.gmtime()) print "[showError] Time check (%s) point at : %s" % (label, nows) print "[showError] Since start: %s [s]" % (now - time_point.start) if sub_lap: print "[showError] Sub Lap : %s [s]" % (now - time_point.sub_lap) time_point.sub_lap = now else: print "[showError] Lap : %s [s]" % (now - time_point.lap) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime( time.gmtime()) task_error_site_count = {} one_explanation = defaultdict(set) per_task_explanation = defaultdict(set) if wfn in [ 'vlimant_task_EXO-RunIISummer15wmLHEGS-04800__v1_T_170906_141738_1357' ]: return task_error_site_count, one_explanation time_point("Starting with %s" % wfn) threads = [] SI = global_SI() UC = unifiedConfiguration() wfi = workflowInfo(url, wfn) time_point("wfi", sub_lap=True) where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo() time_point("acdcinfo", sub_lap=True) all_blocks, needed_blocks_loc, files_in_blocks, files_and_loc_notin_dbs = wfi.getRecoveryBlocks( ) time_point("inputs", sub_lap=True) ancestor = workflowInfo(url, wfn) lhe, prim, _, sec = ancestor.getIO() high_order_acdc = 0 while ancestor.request['RequestType'] == 'Resubmission': ancestor = workflowInfo(url, ancestor.request['OriginalRequestName']) lhe, prim, _, sec = ancestor.getIO() high_order_acdc += 1 no_input = (not lhe) and len(prim) == 0 and len(sec) == 0 cache = options.cache print "cache timeout", cache err = wfi.getWMErrors(cache=cache) time_point("wmerrors", sub_lap=True) stat = wfi.getWMStats(cache=cache) time_point("wmstats", sub_lap=True) #adcd = wfi.getRecoveryDoc() total_by_code_dash = defaultdict(int) total_by_site_dash = defaultdict(int) r_dashb = defaultdict(lambda: defaultdict(int)) dash_board_h = 1 if False: ## NB get the since from when the wf has started, not a fixed value ## no dashboard until we get a better api #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache) dashb = {} #print json.dumps( dashb , indent=2) for site, sinfo in dashb.items(): for s_code, counts in sinfo.items(): d_statuses = ['submitted', 'pending', 'app-unknown', 'done'] total_by_code_dash[str(s_code)] += counts.get('submitted', 0) total_by_site_dash[site] += counts.get('submitted', 0) r_dashb[str(s_code)][site] += counts.get('submitted', 0) print json.dumps(total_by_code_dash, indent=2) print json.dumps(total_by_site_dash, indent=2) time_point("Got most input") status_per_task = defaultdict(lambda: defaultdict(int)) if not 'AgentJobInfo' in stat: stat['AgentJobInfo'] = {} #print "bad countent ?" #print json.dumps( stat, indent=2) for agent in stat['AgentJobInfo']: for task in stat['AgentJobInfo'][agent]['tasks']: if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]: continue for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']: info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][ status] #print status,stat['AgentJobInfo'][agent]['tasks'][task]['status'][status] if type(info) == dict: status_per_task[task][status] += sum( stat['AgentJobInfo'][agent]['tasks'][task]['status'] [status].values()) else: status_per_task[task][status] += stat['AgentJobInfo'][ agent]['tasks'][task]['status'][status] #print json.dumps( status_per_task, indent=2) db_total_per_site = defaultdict(int) db_total_per_code = defaultdict(int) ## cannot do that since there is no task count in dashboard and we have to take away the submitted #for site in dashb: # for error in dashb[site]: # db_total_per_site[site] += dashb[site][error] # db_total_per_code[code] += dashb[site][error] print "ACDC Information" print "\t where to re-run" print json.dumps(where_to_run, indent=2) print "\t Missing events" print json.dumps(missing_to_run, indent=2) print "\t Missing events per site" print json.dumps(missing_to_run_at, indent=2) if not where_to_run and not missing_to_run and not missing_to_run_at: print "showError is unable to run" #return task_error_site_count, one_explanation pass do_JL = not options.no_JL do_CL = not options.no_CL do_all_error_code = options.all_errors if high_order_acdc >= 1: print high_order_acdc, "order request, pulling down all logs" do_all_error_code = True if wfi.isRelval(): print "getting all codes for relval" do_all_error_code = True tasks = sorted(set(err.keys() + missing_to_run.keys())) if not tasks: print "no task to look at" #return task_error_site_count html = "<html> <center><h1>%s, Updated on %s (GMT)" % ( wfn, time.asctime(time.gmtime())) html += '</center>' html += '<a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>dts</a>, ' % ( wfn) html += '<a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>ac</a>, ' % ( wfi.request['PrepID']) html += '<a href=https://cms-gwmsmon.cern.ch/prodview/%s>Job Progress</a>, ' % ( wfn) r_type = wfi.request.get('OriginalRequestType', wfi.request.get('RequestType', 'NaT')) if r_type in ['ReReco']: html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a>, ' % wfi.request[ 'PrepID'] html += '<a href="https://its.cern.ch/jira/issues/?jql=text~%s AND project = CMSCOMPPR" target="_blank">jira</a>' % ( wfi.request['PrepID']) html += '<hr>' html += '<a href=#IO>I/O</a>, <a href=#ERROR>Errors</a>, <a href=#BLOCK>blocks</a>, <a href=#FILE>files</a>, <a href=#CODES>Error codes</a><br>' html += '<hr>' time_point("Header writen") html += '<a name=IO></a>' if prim: html += 'Reads in primary<br>' rwl = wfi.getRunWhiteList() lwl = wfi.getLumiWhiteList() for dataset in prim: html += '<b>%s </b>(events/lumi ~%d)' % ( dataset, getDatasetEventsPerLumi(dataset)) blocks = getDatasetBlocks(dataset, runs=rwl) if rwl else None blocks = getDatasetBlocks(dataset, lumis=lwl) if lwl else None available = getDatasetBlocksFraction(url, dataset, only_blocks=blocks) html += '<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>' % available html += '<ul>' presence = getDatasetPresence(url, dataset, only_blocks=blocks) for site in sorted(presence.keys()): html += '<li>%s : %.2f %%' % (site, presence[site][1]) html += '</ul><br>' if sec: html += 'Reads in secondary<br>' for dataset in sec: presence = getDatasetPresence(url, dataset) html += '<b>%s</b><ul>' % dataset for site in sorted(presence.keys()): html += '<li>%s : %.2f %%' % (site, presence[site][1]) html += '</ul>' outs = sorted(wfi.request['OutputDatasets']) if outs: html += 'Produces<br>' for dataset in outs: presence = getDatasetPresence(url, dataset) html += '<b>%s </b>(events/lumi ~ %d)<ul>' % ( dataset, getDatasetEventsPerLumi(dataset)) for site in sorted(presence.keys()): html += '<li>%s : %.2f %%' % (site, presence[site][1]) html += '</ul>' time_point("Input checked") html += """ <hr><br> <a name=ERROR></a> <ul> <li> <b><i>dashboard numbers over %d days</b></i> <li> ↑ %% with respect to total number of error in the code <li> → %% with respect to total number of error at the site </ul> """ % (dash_board_h) html += '<br>' n_expose_base = options.expose # if options else UC.get('n_error_exposed') print "getting", n_expose_base, "logs by default" if tasks: min_rank = min([task.count('/') for task in tasks]) for task in tasks: n_expose = n_expose_base expose_archive_code = dict([(str(code), defaultdict(lambda: n_expose)) for code in UC.get('expose_archive_code')]) expose_condor_code = dict([(str(code), defaultdict(lambda: n_expose)) for code in UC.get('expose_condor_code')]) #print task task_rank = task.count('/') task_short = task.split('/')[-1] total_per_site = defaultdict(int) time_point("Starting with task %s" % task_short, sub_lap=True) notreported = 'NotReported' total_count = defaultdict(int) error_site_count = defaultdict(lambda: defaultdict(int)) all_not_reported = set() for agent in stat['AgentJobInfo']: for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get( 'skipped', {}): info = stat['AgentJobInfo'][agent]['tasks'][task]['skipped'][ site] #print info all_not_reported.add(site) ce = SI.SE_to_CE(site) error_site_count[notreported][ce] += info.get( 'skippedFiles', 0) total_count[notreported] += info.get('skippedFiles', 0) for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get( 'sites', {}): info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][ site] for s in ['success', 'failure', 'cooloff', 'submitted']: if not s in info: continue data = info[s] if type(data) == dict: total_per_site[site] += sum(data.values()) else: total_per_site[site] += data #is the task relevant to recover (discard log, cleanup) if any([v in task.lower() for v in ['logcol', 'cleanup']]): continue #total_count= defaultdict(int) #error_site_count = defaultdict( lambda : defaultdict(int)) if not task in err: print task, "has not reported error" err[task] = {} #print err[task].keys() for exittype in err[task]: #print "\t",err[task][exittype].keys() for errorcode_s in err[task][exittype]: if errorcode_s == '0': continue #print "\t\t",err[task][exittype][errorcode_s].keys() for site in err[task][exittype][errorcode_s]: ce = SI.SE_to_CE(site) count = err[task][exittype][errorcode_s][site][ 'errorCount'] total_count[errorcode_s] += count #error_site_count[errorcode_s][site] += count error_site_count[errorcode_s][ce] += count ## show the total all_sites = set() all_codes = set() for code in error_site_count: for site in error_site_count[code]: all_sites.add(site) if code != '0': all_codes.add(code) s_per_code = defaultdict(int) for site in all_sites: for code in sorted(all_codes): s_per_code[code] += error_site_count[code][site] expose_top_N = UC.get('expose_top_N') count_top_N = min( sorted(s_per_code.values(), reverse=True)[:expose_top_N]) if s_per_code else -1 for exittype in err[task]: #print "\t",err[task][exittype].keys() for errorcode_s in err[task][exittype]: if errorcode_s == '0': continue #print "\t\t",err[task][exittype][errorcode_s].keys() force_code = (count_top_N > 0 and s_per_code[errorcode_s] >= count_top_N) if force_code: print "will expose", errorcode_s, "anyways" for site in err[task][exittype][errorcode_s]: ce = SI.SE_to_CE(site) count = err[task][exittype][errorcode_s][site][ 'errorCount'] ###total_count[errorcode_s] += count #error_site_count[errorcode_s][site] += count ###error_site_count[errorcode_s][ce] += count for sample in err[task][exittype][errorcode_s][site][ 'samples']: #print sample.keys() for step in sample['errors']: for report in sample['errors'][step]: if report['type'] == 'CMSExeption': continue #if int(report['exitCode']) == int(errorcode_s): one_explanation[errorcode_s].add( "%s (Exit code: %s) \n%s" % (report['type'], report['exitCode'], report['details'])) per_task_explanation[ "%s:%s" % (task_short, errorcode_s)].add( "%s (Exit code: %s) \n%s" % (report['type'], report['exitCode'], report['details'])) #one_explanation[errorcode_s].add( report['details'] ) #else: #one_explanation[ agent = sample['agent_name'] wmbs = sample['wmbsid'] workflow = sample['workflow'] if force_code: if not errorcode_s in expose_condor_code: expose_condor_code[errorcode_s] = defaultdict( lambda: n_expose) if not errorcode_s in expose_archive_code: expose_archive_code[errorcode_s] = defaultdict( lambda: n_expose) if do_CL and ((errorcode_s in expose_condor_code and expose_condor_code[errorcode_s][agent]) ) and 'cern' in agent: if errorcode_s in expose_condor_code: expose_condor_code[errorcode_s][agent] -= 1 print errorcode_s, agent, "error count", expose_condor_code.get( errorcode_s, {}).get(agent, 0) threads.append( AgentBuster(agent=agent, workflow=workflow, wmbs=wmbs, errorcode_s=errorcode_s, base_eos_dir=base_eos_dir, monitor_eos_dir=monitor_eos_dir, task_short=task_short)) for out in sample['output']: #print out if out['type'] == 'logArchive': if do_JL and ( (errorcode_s in expose_archive_code and expose_archive_code[errorcode_s][agent] > 0)): if errorcode_s in expose_archive_code: expose_archive_code[errorcode_s][ agent] -= 1 print errorcode_s, agent, "error count", expose_archive_code.get( errorcode_s, {}).get(agent, 0) threads.append( XRDBuster( out_lfn=out['lfn'], monitor_eos_dir=monitor_eos_dir, wfn=wfn, errorcode_s=errorcode_s, task_short=task_short, from_eos=( not options.not_from_eos ), # if options else True), )) #print task #print json.dumps( total_count, indent=2) #print json.dumps( explanations , indent=2) all_sites = set() all_codes = set() for code in error_site_count: for site in error_site_count[code]: all_sites.add(site) if code != '0': all_codes.add(code) ## parse the dashboard data for site in total_by_site_dash: ## no. cannot discriminate by task in dashboard... #all_sites.add( site ) pass ## parse the acdc data #notreported='NotReported' #all_missing_stats = set() #for site in missing_to_run_at[task] if task in missing_to_run_at else []: # if not missing_to_run_at[task][site]: continue # ce = SI.SE_to_CE( site ) # #all_sites.add( ce ) # all_missing_stats.add( ce ) #all_missing_stats = all_missing_stats &set(SI.all_sites) #all_not_reported = all_missing_stats - all_sites #print task #print "site with no report",sorted(all_not_reported) #print sorted(all_sites) #print sorted(all_missing_stats) #all_sites = all_missing_stats | all_sites #all_sites = all_sites & set(SI.all_sites) no_error = len(all_not_reported) != 0 if not no_error and notreported in all_codes: all_codes.remove(notreported) missing_events = missing_to_run[task] if task in missing_to_run else 0 feff = wfi.getFilterEfficiency(task.split('/')[-1]) html += "<a name=%s>" % task.split('/')[-1] html += "<b>%s</b>" % task.split('/')[-1] if missing_events: if feff != 1.: html += ' is missing %s events in input and <b>about %s events in output</b>' % ( "{:,}".format(missing_events), "{:,}".format( int(missing_events * feff))) else: html += ' is missing <b>%s events in I/O</b>' % ( "{:,}".format(missing_events)) html += ' <a href="https://cmsweb.cern.ch/couchdb/acdcserver/_design/ACDC/_view/byCollectionName?key=%%22%s%%22&include_docs=true&reduce=false" target=_blank>AC/DC</a>' % ( wfn) if no_error: html += "<br><b><font color=red> and has UNreported error</font></b>" html += "<br><table border=1><thead><tr><th>Sites/Errors</th>" #for site in all_sites: # html+='<th>%s</th>'%site for code in sorted(all_codes): #html+='<th><a href="#%s">%s</a>'%(code,code) html += '<th><a href="#%s:%s">%s</a>' % (task_short, code, code) if (str(code) in expose_archive_code or do_all_error_code): # and n_expose_base: html += ' <a href=%s/joblogs/%s/%s/%s>, JobLog</a>' % ( unified_url_eos, wfn, code, task_short) if (str(code) in expose_condor_code or do_all_error_code): # and n_expose_base: html += ' <a href=%s/condorlogs/%s/%s/%s>, CondorLog</a>' % ( unified_url_eos, wfn, code, task_short) html += '</th>' html += '<th>Total jobs</th><th>Site Ready</th>' html += '</tr></thead>\n' html += '<tr><td>Total</td>' for code in sorted(all_codes): html += '<td bgcolor=orange width=100>%d' % (s_per_code[code]) if code in total_by_code_dash: html += ' (<b><i>%d</i></b>)' % total_by_code_dash[code] html += '</td>' ulist = '<ul>' grand = 0 for status in sorted(status_per_task[task].keys()): ulist += '<li> %s %d' % (status, status_per_task[task][status]) grand += status_per_task[task][status] ulist += '<li><b> Total %d </b>' % grand ulist += '</ul>' #html += '<td bgcolor=orange> %.2f%% </td>'% (100.*(float(sum(s_per_code.values()))/sum(total_per_site.values())) if sum(total_per_site.values()) else 0.) html += '<td bgcolor=orange> → %.2f%% ← </td>' % ( 100. * (float(sum(s_per_code.values())) / grand) if grand else 0.) html += '<td bgcolor=orange> %s </td>' % ulist html += '</tr>' def palette(frac): _range = { 0.0: 'green', 0.5: 'green', 0.6: 'darkgreen', 0.7: 'orange', 0.8: 'salmon', 0.9: 'red' } which = [k for k in _range.keys() if k <= frac] if which: there = max(which) else: there = max(_range.keys()) return _range[there] for site in sorted(all_sites): site_in = 'Yes' color = 'bgcolor=lightblue' if not site in SI.sites_ready: color = 'bgcolor=indianred' site_in = '<b>No</b>' if task in missing_to_run_at and missing_to_run_at[task][ SI.CE_to_SE(site)] == 0 or min_rank == task_rank: color = 'bgcolor=aquamarine' site_in = '<b>No</b> but fine' if not no_error: site_in += " (%s events)" % ("{:,}".format( missing_to_run_at[task][SI.CE_to_SE(site)]) if task in missing_to_run_at else '--') html += '<tr><td %s>%s</td>' % (color, site) for code in sorted(all_codes): if code == notreported: html += '<td %s width=200>%s events </td>' % ( color, "{:,}".format( missing_to_run_at[task][SI.CE_to_SE(site)])) else: if error_site_count[code][site]: er_frac = float( error_site_count[code][site] ) / s_per_code[code] if s_per_code[code] else 0. si_frac = float( error_site_count[code][site]) / total_per_site[ site] if total_per_site[site] else 0. html += '<td %s width=200>%d' % ( color, error_site_count[code][site]) if code in r_dashb and site in r_dashb[code]: html += ' (<b><i>%d</i></b>)' % ( r_dashb[code][site]) html += ', <font color=%s>↑ %.1f%%</font>, <font color=%s>→ %.1f%%</font></td>' % ( palette(er_frac), 100. * er_frac, palette(si_frac), 100. * si_frac) else: html += '<td %s>0</td>' % color html += '<td bgcolor=orange>%d</td>' % total_per_site[site] html += '<td %s>%s</td>' % (color, site_in) html += '</tr>\n' html += '</table><br>' task_error_site_count[task] = error_site_count ## run all retrieval run_threads = ThreadHandler( threads=threads, n_threads=options.log_threads, # if options else 5, sleepy=10, timeout=UC.get('retrieve_errors_timeout'), verbose=True) run_threads.start() html += '<hr><br>' html += '<a name=BLOCK></a>' html += "<b>Blocks (%d/%d) needed for recovery</b><br>" % ( len(needed_blocks_loc), len(all_blocks)) for block in sorted(needed_blocks_loc.keys()): html += '%s <b>@ %s</b><br>' % (block, ','.join( sorted(needed_blocks_loc[block]))) html += '<a name=FILE></a>' html += "<br><b>Files in no block</b><br>" rthreads = [] check_files = [f for f in files_and_loc_notin_dbs.keys() if '/store' in f] random.shuffle(check_files) check_files = check_files[:100] check_files = [] ## disable it completely by_f = {} if check_files: for f in check_files: rthreads.append(ReadBuster(file=f)) print "checking on existence of", len(rthreads), "files" run_rthreads = ThreadHandler(threads=rthreads, n_threads=20, timeout=10) run_rthreads.start() while run_rthreads.is_alive(): time.sleep(10) for t in run_rthreads.threads: by_f[t.file] = t.readable #print "checked",t.file,t.readable for f in sorted(files_and_loc_notin_dbs.keys()): readable = by_f.get(f, -1) if readable == -1: fs = '%s' % f elif readable == 0: fs = '<font color=light green>%s</font>' % f #print f,"is readable" else: fs = '<font color=red>%s</font>' % f #print f,"is not readable" html += '%s <b>@</b> %s<br>' % (fs, ','.join( sorted(files_and_loc_notin_dbs[f]))) #html +='%s <b>@</b> %s<br>'%(f , ','.join(sorted(files_and_loc_notin_dbs[f])) ) html += '<hr><br>' html += '<a name=CODES></a>' html += '<table border=1>' for code in per_task_explanation: html += '<tr><td><a name="%s">%s</a><br><a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/JobExitCodes>code twiki</a></td><td>%s</td></tr>' % ( code, code, '<br><br>'.join(per_task_explanation[code]).replace( '\n', '<br>')) #for code in one_explanation: # html +='<tr><td><a name="%s">%s</a></td><td>%s</td></tr>'% ( code, code, '<br><br>'.join(one_explanation[code]).replace('\n','<br>' )) html += '</table>' html += ('<br>' * 30) html += '</html>' time_point("Report finished") wfi.sendLog('error', html, show=False) fn = '%s' % wfn time_point("error send to ES") open('%s/report/%s' % (monitor_dir, fn), 'w').write(html) open('%s/report/%s' % (monitor_eos_dir, fn), 'w').write(html) time_point("Finished with showError") ## then wait for the retrivals to complete ping = 0 while run_threads.is_alive(): ping += 1 if ping % 100: time_point("waiting for sub-threads to finish") time.sleep(6) time_point("Finished with retrieval threads") return task_error_site_count, one_explanation
def checkor(url, spec=None, options=None): fDB = closeoutInfo() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.fetch: ## get all in running and check wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.nofetch: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign by_passes = [] holdings = [] for bypassor,email in [('jbadillo','*****@*****.**'),('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): print "no file",bypass_file continue try: by_passes.extend( json.loads(open(bypass_file).read())) except: print "cannot get by-passes from",bypass_file,"for",bypassor sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): print "no file",holding_file continue try: holdings.extend( json.loads(open(holding_file).read())) except: print "cannot get holdings from",holding_file,"for",bypassor sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) total_running_time = 5.*60. sleep_time = max(0.5, total_running_time / len(wfs)) for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) print "checking on",wfo.name ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out print wfo.name,"is already",wfo.wm_status wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' print wfo.name,"is in trouble",wfo.wm_status session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet print wfo.name,"not running yet" session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings: print wfo.name,"on hold" continue if wfo.name in holdings: wfo.status = 'assistance-onhold' print "setting",wfo.name,"on hold" session.commit() continue if wfo.wm_status != 'completed': ## for sure move on with closeout check if in completed print "no need to check on",wfo.name,"in status",wfo.wm_status session.commit() continue session.commit() sub_assistance="" # if that string is filled, there will be need for manual assistance is_closing = True ## do the closed-out checks one by one ## get it from somewhere by_pass_checks = False if wfo.name in by_passes: print "we can bypass checks on",wfo.name by_pass_checks = True for bypass in by_passes: if bypass in wfo.name: print "we can bypass",wfo.name,"because of keyword",bypass by_pass_checks = True break # tuck out DQMIO/DQM wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out] ## anything running on acdc familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] has_recovery_going=False had_any_recovery = False for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestStatus'] in ['running-open','running-closed','assignment-approved','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) #print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False has_recovery_going=True elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) had_any_recovery = True ## completion check percent_completions = {} # print "let's see who is crashing", wfo.name # print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] fractions_pass = {} for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) fractions_pass[output] = 0.95 c = get_campaign(output, wfi) if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] print "overriding fraction to",fractions_pass[output],"for",output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if has_recovery_going: sub_assistance+='-recovering' elif had_any_recovery: ## we want to have this looked at sub_assistance+='-manual' else: sub_assistance+='-recovery' is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = get_campaign(output, wfi) #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if any([ events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? sub_assistance+='-biglumi' is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) vetoed_custodial_tier = ['MINIAODSIM'] out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" break ## get from the parent pick_custodial = True if not custodial and 'InputDataset' in wfi.request: ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( wfi.request['InputDataset']) ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort" sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%wfi.request['InputDataset']) is_closing = False pick_custodial = False if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE() if custodial and ((not sub_assistance and not acdc) or by_pass_checks): ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) else: print "no file in phedex for",output," not good to add to custodial requests" else: print "cannot find a custodial for",wfo.name is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? sub_assistance+='-duplicates' is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) if by_pass_checks: ## force closing is_closing = True ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() else: print "could not close out",wfo.name,"will try again next time" else: ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it new_status = 'assistance'+sub_assistance print wfo.name,"needs assistance with",new_status if sub_assistance and wfo.status != new_status and 'PrepID' in wfi.request and not 'manual' in wfo.status: pid = wfi.getPrepIDs()[0].replace('task_','') #pid = wfi.request['PrepID'].replace('task_','') ## notify messages= { 'recovery' : 'Samples completed with missing lumi count:\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ) ), 'biglumi' : 'Samples completed with large luminosity blocks:\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])])), 'duplicate' : 'Samples completed with duplicated luminosity blocks:\n%s'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), } text ="The request %s (%s) is facing issue in production.\n" %( pid, wfo.name ) content = "" for case in messages: if case in new_status: content+= "\n"+messages[case]+"\n" text += content text += "You are invited to check, while this is being taken care of by Ops.\n" text += "This is an automated message." if use_mcm and content: print "Sending notification back to requestor" print text batches = mcm.getA('batches',query='contains=%s&status=announced'%pid) if len(batches): ## go notify the batch bid = batches[-1]['prepid'] print "batch nofication to",bid mcm.put('/restapi/batches/notify', { "notes" : text, "prepid" : bid}) ## go notify the request print "request notification to",pid mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid] }) ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" fDB.html() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations