def batchor(url): UC = unifiedConfiguration() SI = global_SI() CI = campaignInfo() BI = batchInfo() ## get all workflows in assignment-approved with SubRequestType = relval all_wfs = [] for user in UC.get("user_relval"): all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain')) wfs = filter( lambda r: r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs) ## need a special treatment for those hi_wfs = filter( lambda r: r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs) by_campaign = defaultdict(set) by_hi_campaign = defaultdict(set) for wf in wfs: print "Relval:", wf['RequestName'], wf['Campaign'] by_campaign[wf['Campaign']].add(wf['PrepID']) for wf in hi_wfs: print "HI Relval:", wf['RequestName'], wf['Campaign'] by_hi_campaign[wf['Campaign']].add(wf['PrepID']) default_setup = { "go": True, "parameters": { "SiteWhitelist": ["T1_US_FNAL"], "MergedLFNBase": "/store/relval", "Team": "relval", "NonCustodialGroup": "RelVal" }, "custodial_override": "notape", "phedex_group": "RelVal", "lumisize": -1, "fractionpass": 0.0, "maxcopies": 1 } default_hi_setup = copy.deepcopy(default_setup) add_on = {} relval_routing = UC.get('relval_routing') def pick_one_site(p): ## modify the parameters on the spot to have only one site if "parameters" in p and "SiteWhitelist" in p["parameters"] and len( p["parameters"]["SiteWhitelist"]) > 1: choose_from = list( set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready)) picked = random.choice(choose_from) print "picked", picked, "from", choose_from p["parameters"]["SiteWhitelist"] = [picked] batches = BI.all() for campaign in by_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy(default_setup) for key in relval_routing: if key in campaign: ## augment with the routing information augment_with = relval_routing[key] print "Modifying the batch configuration because of keyword", key print "with", augment_with setup = deep_update(setup, augment_with) pick_one_site(setup) add_on[campaign] = setup sendLog('batchor', 'Adding the relval campaigns %s with parameters \n%s' % (campaign, json.dumps(setup, indent=2)), level='critical') BI.update(campaign, by_campaign[campaign]) for campaign in by_hi_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy(default_hi_setup) possible_sites = set(["T1_DE_KIT", "T1_FR_CCIN2P3"]) hi_site = random.choice(list(possible_sites)) setup["parameters"]["SiteWhitelist"] = [hi_site] pick_one_site(setup) add_on[campaign] = setup sendLog('batchor', 'Adding the HI relval campaigns %s with parameters \n%s' % (campaign, json.dumps(setup, indent=2)), level='critical') BI.update(campaign, by_hi_campaign[campaign]) ## only new campaigns in announcement for new_campaign in list( set(add_on.keys()) - set(CI.all(c_type='relval'))): ## this is new, and can be announced as such print new_campaign, "is new stuff" subject = "Request of RelVal samples batch %s" % new_campaign text = """Dear all, A new batch of relval workflows was requested. Batch ID: %s Details of the workflows: https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s This is an automated message""" % ( new_campaign, new_campaign, ) print subject print text to = ['*****@*****.**'] sendEmail(subject, text, destination=to) sendLog('batchor', text, level='critical') ## go through all existing campaigns and remove the ones not in use anymore ? for old_campaign in CI.all(c_type='relval'): all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True) if not all_in_batch: continue is_batch_done = all( map( lambda s: not s in [ 'completed', 'force-complete', 'running-open', 'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [wf['RequestStatus'] for wf in all_in_batch])) ## check all statuses if is_batch_done: #print "batch",old_campaign,"can be closed or removed if necessary" #campaigns[old_campaign]['go'] = False ## disable CI.pop(old_campaign) ## or just drop it all together ? BI.pop(old_campaign) print "batch", old_campaign, " configuration was removed" ## merge all anyways CI.update(add_on, c_type='relval')
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos=[] fetch_from = [] if specific or options.early: fetch_from.extend(['considered','staging']) if specific: fetch_from.extend(['considered-tried']) fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from",fetch_from for status in fetch_from: wfos.extend(session.query(Workflow).filter(Workflow.status==status).all()) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )) all_stuck.update( getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) if options.priority and int(wfh.request['RequestPriority']) < options.priority: continue options_text="" if options.early: options_text+=", early option is ON" if options.partial: options_text+=", partial option is ON" options_text+=", good fraction is %.2f"%options.good_enough wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = False for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update( CI.campaigns[campaign] ) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]: banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go=True wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier))) sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys())))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]: assign_parameters.update( allowed_secondary[sec] ) if no_go: n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns: assign_parameters.update( CI.campaigns[wfh.request['Campaign']] ) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off") primary_aaa=False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update( aaa_mapping.get(site,[]) ) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed)) if not primary_aaa: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints",sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled+=1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low ))) copies_wanted = max(1., copies_wanted-1.) if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) above_good = all([available >= do_partial for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') n_stalled+=1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good): wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled+=1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor',"cannot be assign with no matched sites") sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed)) if 'parameters' in assign_parameters: parameters.update( assign_parameters['parameters'] ) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters #parameters.update( CI.parameters(wfh.request['Campaign']) ) parameters.update( assign_parameters.get('parameters',{}) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'NoGo' in split_check.values(): wfh.sendLog('assignor', "Failing splitting check") sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical') n_stalled+=1 continue if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical') ## we have a problem here, that EventBased should never be used as a backup if not options.go: n_stalled+=1 continue continue ## skip all together elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per job") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') elif 'EventsPerLumi' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this") # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat #NLI.lock( secure ) LI.lock( secure, reason = 'assigning') #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign. Please check the logs") print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def spawn_harvesting(url, wfi, sites_for_DQMHarvest): SI = global_SI() all_OK = {} requests = [] outputs = wfi.request['OutputDatasets'] if ('EnableHarvesting' in wfi.request and not wfi.request['EnableHarvesting']) and ( 'DQMConfigCacheID' in wfi.request and wfi.request['DQMConfigCacheID']): if not 'MergedLFNBase' in wfi.request: print "f****d up" sendEmail('screwed up wl cache', '%s wl cache is bad' % (wfi.request['RequestName'])) all_OK['fake'] = False return all_OK, requests wfi = workflowInfo(url, wfi.request['RequestName']) dqms = [out for out in outputs if '/DQM' in out] #if not all([in_full[dqm_input] for dqm_input in dqms]): # wfi.sendLog('closor',"will not be able to assign the harvesting: holding up") # for dqm_input in dqms: # all_OK[dqm_input] = False ## raise the subscription to high priority for dqm_input in dqms: ## handle it properly harvesting_schema = { 'Requestor': os.getenv('USER'), 'RequestType': 'DQMHarvest', 'Group': 'DATAOPS' } copy_over = [ 'AcquisitionEra', 'ProcessingString', 'DQMUploadUrl', 'CMSSWVersion', 'CouchDBName', 'CouchWorkloadDBName', 'ConfigCacheUrl', 'DbsUrl', 'inputMode', 'DQMConfigCacheID', 'OpenRunningTimeout', 'ScramArch', 'CMSSWVersion', 'Campaign', 'Memory', #dummy 'SizePerEvent', #dummy 'GlobalTag', #dummy ] for item in copy_over: if item in wfi.request: harvesting_schema[item] = copy.deepcopy(wfi.request[item]) else: print item, "is not in initial schema" harvesting_schema['InputDataset'] = dqm_input harvesting_schema['TimePerEvent'] = 1 harvesting_schema['PrepID'] = 'Harvest-' + wfi.request['PrepID'] if len(wfi.request['RequestString']) > 60: wfi.request['RequestString'] = wfi.request[ 'RequestString'][:60] print "truncating request string", wfi.request['RequestString'] harvesting_schema[ 'RequestString'] = 'HARVEST-' + wfi.request['RequestString'] harvesting_schema['DQMHarvestUnit'] = 'byRun' harvesting_schema['RequestPriority'] = min( wfi.request['RequestPriority'] * 10, 999999) harvest_request = reqMgrClient.submitWorkflow( url, harvesting_schema) if not harvest_request: print "Error in making harvesting for", wfi.request[ 'RequestName'] print "schema" print json.dumps(harvesting_schema, indent=2) harvest_request = reqMgrClient.submitWorkflow( url, harvesting_schema) if not harvest_request: print "Error twice in harvesting for", wfi.request[ 'RequestName'] print "schema" print json.dumps(harvesting_schema, indent=2) if harvest_request: requests.append(harvest_request) ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely data = reqMgrClient.setWorkflowApproved(url, harvest_request) print "created", harvest_request, "for harvesting of", dqm_input wfi.sendLog( 'closor', "created %s for harvesting of %s" % (harvest_request, dqm_input)) ## assign it directly team = wfi.request['Team'] parameters = { 'SiteWhitelist': [ SI.SE_to_CE(se) for se in wfi.request['NonCustodialSites'] ], 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], 'execute': True } #if in_full[dqm_input]: # print "using full copy at",in_full[dqm_input] # parameters['SiteWhitelist'] = [SI.SE_to_CE(se) for se in in_full[dqm_input]] #else: # print "cannot do anything if not having a full copy somewhere" # all_OK[dqm_input]=False # continue parameters['SiteWhitelist'] = sites_for_DQMHarvest result = reqMgrClient.assignWorkflow(url, harvest_request, team, parameters) if not result: #sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog( 'closor', '%s was created at announcement of %s in %s, failed to assign' % (harvest_request, dqm_input, wfi.request['RequestName'])) sendLog( 'closor', '%s was created at announcement of %s in %s, failed to assign' % (harvest_request, dqm_input, wfi.request['RequestName']), level='critical') else: #sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog( 'closor', '%s was created at announcement of %s in %s, and assigned' % (harvest_request, dqm_input, wfi.request['RequestName'])) else: #print "could not make the harvesting for",wfo.name,"not announcing" wfi.sendLog('closor', "could not make the harvesting request") sendLog('closor', "could not make the harvesting request for %s" % wfi.request['RequestName'], level='critical') all_OK[dqm_input] = False return (all_OK, requests)
def batchor( url ): UC = unifiedConfiguration() SI = global_SI() ## get all workflows in assignment-approved with SubRequestType = relval all_wfs = [] for user in UC.get("user_relval"): all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain') ) wfs = filter( lambda r :r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs) ## need a special treatment for those hi_wfs = filter( lambda r :r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs) by_campaign = defaultdict(set) by_hi_campaign = defaultdict(set) for wf in wfs: print "Relval:",wf['RequestName'], wf['Campaign'] #by_campaign[wf['Campaign']].add( wf['RequestName'] ) by_campaign[wf['Campaign']].add( wf['PrepID'] ) for wf in hi_wfs: print "HI Relval:",wf['RequestName'], wf['Campaign'] #by_hi_campaign[wf['Campaign']].add( wf['RequestName'] ) by_hi_campaign[wf['Campaign']].add( wf['PrepID'] ) default_setup = { "go" :True, "parameters" : { "SiteWhitelist": [ "T1_US_FNAL" ], "MergedLFNBase": "/store/relval", "Team" : "relval", "NonCustodialGroup" : "RelVal" }, "custodial" : "T1_US_FNAL_MSS", "custodial_override" : ["DQMIO"], "phedex_group" : "RelVal", "lumisize" : -1, "fractionpass" : 0.0, "maxcopies" : 1 } default_hi_setup = copy.deepcopy( default_setup ) add_on = {} batches = json.loads( open('batches.json').read() ) relval_routing = UC.get('relval_routing') def pick_one_site( p): ## modify the parameters on the spot to have only one site if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(p["parameters"]["SiteWhitelist"])>1: choose_from = list(set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready)) picked = random.choice( choose_from ) print "picked",picked,"from",choose_from p["parameters"]["SiteWhitelist"] = [picked] for campaign in by_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy( default_setup ) for key in relval_routing: if key in campaign: ## augment with the routing information augment_with = relval_routing[key] print "Modifying the batch configuration because of keyword",key print "with",augment_with setup = deep_update( setup, augment_with ) #if 'cc7' in campaign: setup["parameters"]["SiteWhitelist"] = ["T2_US_Nebraska"] pick_one_site( setup ) add_on[campaign] = setup sendLog('batchor','Adding the relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_campaign[campaign] )) + batches[campaign] )) for campaign in by_hi_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy( default_hi_setup ) hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"]) setup["parameters"]["SiteWhitelist"]=[ hi_site ] #setup["parameters"]["SiteWhitelist"]=["T1_DE_KIT","T1_FR_CCIN2P3"] pick_one_site( setup ) add_on[campaign] = setup sendLog('batchor','Adding the HI relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_hi_campaign[campaign] )) + batches[campaign] )) open('batches.json','w').write( json.dumps( batches , indent=2 ) ) ## open the campaign configuration campaigns = json.loads( open('campaigns.relval.json').read() ) ## protect for overwriting ?? for new_campaign in list(set(add_on.keys())-set(campaigns.keys())): ## this is new, and can be announced as such print new_campaign,"is new stuff" subject = "Request of RelVal samples batch %s"% new_campaign text="""Dear all, A new batch of relval workflows was requested. Batch ID: %s Details of the workflows: https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s This is an automated message"""%( new_campaign, new_campaign, ) print subject print text to = ['*****@*****.**'] sendEmail(subject, text, destination=to) sendLog('batchor',text, level='critical') ## go through all existing campaigns and remove the ones not in use anymore ? for old_campaign in campaigns.keys(): all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True) is_batch_done = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [wf['RequestStatus']for wf in all_in_batch])) ## check all statuses if is_batch_done: #print "batch",old_campaign,"can be closed or removed if necessary" #campaigns[old_campaign]['go'] = False ## disable campaigns.pop( old_campaign ) ## or just drop it all together ? print "batch",old_campaign," configuration was removed" ## merge all anyways campaigns.update( add_on ) ## write it out for posterity open('campaigns.json.updated','w').write(json.dumps( campaigns , indent=2)) ## read back rread = json.loads(open('campaigns.json.updated').read()) os.system('mv campaigns.json.updated campaigns.relval.json')
def assignor(url, specific=None, talk=True, options=None): if userLock() and not options.manual: return mlock = moduleLock() if mlock() and not options.manual: return if not componentInfo().check() and not options.manual: return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() SI = global_SI() ###NLI = newLockInfo() ###if not NLI.free() and not options.go: return LI = lockInfo() #if not LI.free() and not options.go and not options.manual: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass aaa_mapping = json.loads(eosRead('%s/equalizor.json' % monitor_pub_dir))['mapping'] all_stuck = set() all_stuck.update( json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') # Temporarily switch off prioritization random.shuffle(wfos) ##order by priority instead of random """ if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank( wfn ): return cache.index( wfn ) if wfn in cache else 0 wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True) print "10 first",[wfo.name for wfo in wfos[:10]] print "10 last",[wfo.name for wfo in wfos[-10:]] else: random.shuffle( wfos ) """ for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue if not options.manual and 'rucio' in (wfo.name).lower(): continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" wfh.sendLog('assignor', "%s to be assigned %s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed, sites_not_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('assignor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('assignor', critical_msg, level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) blocks = wfh.getBlocks() if blocks: wfh.sendLog( 'assignor', "Needs {} blocks in input {}".format(len(blocks), '\n'.join(blocks))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters and primary: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] wfh.sendLog( 'assignor', "Initial values for primary_AAA=%s and secondary_AAA=%s" % (primary_aaa, secondary_aaa)) if primary_aaa: if "T2_CH_CERN_HLT" in sites_allowed: sites_allowed.remove("T2_CH_CERN_HLT") if "T2_CH_CERN_HLT" not in sites_not_allowed: sites_not_allowed.append("T2_CH_CERN_HLT") ## keep track of this, after secondary input location restriction : that's how you want to operate it initial_sites_allowed = copy.deepcopy(sites_allowed) set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) ## if they are requested for processing, they should bbe all closed already # FIXME: remove this closeAllBlocks #closeAllBlocks(url, prim, blocks) ## should be 2 but for the time-being let's lower it to get things going _copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) # TODO Alan on 1/april/2020: keep the AAA functionality if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_allowed: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_allowed) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if isStoreResults: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1t2_only = [ ce for ce in sites_allowed if [ce.startswith('T1') or ce.startswith('T2')] ] if t1t2_only: # try to pick from T1T2 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])] # then pick any otherwise else: sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] print "available=", SI.disk[sites_out[0]] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'SiteBlacklist': sites_not_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: # Do not set TrustPUSitelist to True if there is no secondary if secondary: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] # FIXME: decide which of the lines below needs to remain... eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) if wfh.producePremix() and (not wfh.isRelval()): title = "Heavy workflow assigned to {}".format( parameters['SiteWhitelist']) body = "Workflow name: {}".format( wfh.request['RequestName']) body += "\nOutput dataset(s): {}".format( wfh.request['OutputDatasets']) body += "\nAssigned to: {}".format( parameters['SiteWhitelist']) sendEmail( title, body, destination=[ '*****@*****.**' ]) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def parse_one(url, wfn, options=None): SI = global_SI() wfi = workflowInfo( url , wfn) where_to_run, missing_to_run,missing_to_run_at = wfi.getRecoveryInfo() all_blocks,needed_blocks,files_in_blocks,files_notin_dbs = wfi.getRecoveryBlocks() ancestor = workflowInfo( url , wfn) lhe,prim,_,sec = ancestor.getIO() high_order_acdc = 0 while ancestor.request['RequestType'] == 'Resubmission': ancestor = workflowInfo(url, ancestor.request['OriginalRequestName']) lhe,prim,_,sec = ancestor.getIO() high_order_acdc += 1 no_input = (not lhe) and len(prim)==0 and len(sec)==0 cache = 0 if options: cache = options.cache print "cache timeout", cache err= wfi.getWMErrors(cache=cache) stat = wfi.getWMStats(cache=cache) #adcd = wfi.getRecoveryDoc() total_by_code_dash = defaultdict( int ) total_by_site_dash = defaultdict( int ) r_dashb =defaultdict( lambda : defaultdict( int )) dash_board_h = 1 if True :#'pdmvserv_TOP-RunIISummer15wmLHEGS-00103_00183_v0__161005_165048_809' in wfn: ## NB get the since from when the wf has started, not a fixed value ## no dashboard until we get a better api #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache) dashb = {} #print json.dumps( dashb , indent=2) for site,sinfo in dashb.items(): for s_code,counts in sinfo.items(): d_statuses = ['submitted','pending','app-unknown','done'] total_by_code_dash[str(s_code)]+= counts.get('submitted',0) total_by_site_dash[site] += counts.get('submitted',0) r_dashb[str(s_code)][site] += counts.get('submitted',0) print json.dumps(total_by_code_dash , indent=2) print json.dumps(total_by_site_dash , indent=2) status_per_task = defaultdict(lambda : defaultdict(int)) if not 'AgentJobInfo' in stat: stat['AgentJobInfo'] = {} #print "bad countent ?" #print json.dumps( stat, indent=2) for agent in stat['AgentJobInfo']: for task in stat['AgentJobInfo'][agent]['tasks']: if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]: continue for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']: info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][status] #print status,stat['AgentJobInfo'][agent]['tasks'][task]['status'][status] if type(info)==dict: status_per_task[task][status] += sum( stat['AgentJobInfo'][agent]['tasks'][task]['status'][status].values()) else: status_per_task[task][status] += stat['AgentJobInfo'][agent]['tasks'][task]['status'][status] #print json.dumps( status_per_task, indent=2) db_total_per_site = defaultdict(int) db_total_per_code = defaultdict(int) ## cannot do that since there is no task count in dashboard and we have to take away the submitted #for site in dashb: # for error in dashb[site]: # db_total_per_site[site] += dashb[site][error] # db_total_per_code[code] += dashb[site][error] print "ACDC Information" print json.dumps( where_to_run , indent=2) print json.dumps(missing_to_run , indent=2) print json.dumps(missing_to_run_at , indent=2) task_error_site_count ={} one_explanation = defaultdict(set) do_JL = True do_CL = True do_all_error_code = False if options: do_JL = not options.no_JL do_CL = not options.no_CL do_all_error_code = options.all_errors if high_order_acdc>=1: print high_order_acdc,"order request, pulling down all logs" do_all_error_code = True n_expose = 1 if options: n_expose = options.expose expose_archive_code = {'134':defaultdict(lambda : n_expose),#seg fault '139':defaultdict(lambda : n_expose),# ??? '99109':defaultdict(lambda : n_expose),#stageout '99303' : defaultdict(lambda : n_expose),#no pkl report. if you are lucky '60450' : defaultdict(lambda : n_expose),#new '50513':defaultdict(lambda : n_expose),#new '8001': defaultdict(lambda : n_expose),# the usual exception in cmsRun '11003': defaultdict(lambda : n_expose),# job extraction '73': defaultdict(lambda : n_expose),# job extraction } expose_condor_code = {'99109':defaultdict(lambda : n_expose),#stageout '99303':defaultdict(lambda : n_expose),#no pkl report '60450':defaultdict(lambda : n_expose),#new '50513':defaultdict(lambda : n_expose),#new '11003': defaultdict(lambda : n_expose), } tasks = sorted(set(err.keys() + missing_to_run.keys())) if not tasks: print "no task to look at" #return task_error_site_count html="<html> <center><h1><a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>%s</a><br><a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>%s</a><br>"%( wfn, wfn, wfi.request['PrepID'], wfi.request['PrepID'] ) if wfi.request['RequestType'] in ['ReReco']: html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a><br>'% wfi.request['PrepID'] html+= '</center><hr>' if prim: html+='Reads in primary<br>' for dataset in prim: html +='<b>%s</b>'%dataset available = getDatasetBlocksFraction(url, dataset) html +='<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>'% available html +='<ul>' presence = getDatasetPresence(url, dataset) for site in sorted(presence.keys()): html += '<li>%s : %.2f %%'%( site, presence[site][1] ) html+='</ul><br>' if sec: html+='Reads in secondary<br>' for dataset in sec: presence = getDatasetPresence(url, dataset) html +='<b>%s</b><ul>'%dataset for site in sorted(presence.keys()): html += '<li>%s : %.2f %%'%( site, presence[site][1] ) html+='</ul>' html += "Updated on %s (GMT)" % ( time.asctime(time.gmtime()) ) html += """ <ul> <li> <b><i>dashboard numbers over %d days</b></i> <li> ↑ %% with respect to total number of error in the code <li> → %% with respect to total number of error at the site </ul> """%(dash_board_h) html += '<hr><br>' if tasks: min_rank = min([task.count('/') for task in tasks]) for task in tasks: #print task task_rank = task.count('/') task_short = task.split('/')[-1] total_per_site = defaultdict(int) for agent in stat['AgentJobInfo']: if not task in stat['AgentJobInfo'][agent]['tasks']: continue if not 'sites' in stat['AgentJobInfo'][agent]['tasks'][task]:continue for site in stat['AgentJobInfo'][agent]['tasks'][task]['sites']: info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][site] #if site in ['T2_BE_IIHE']: print task,json.dumps( info, indent=2) #print info.keys() for s in ['success','failure','cooloff','submitted']: if not s in info: continue data = info[s] #print s,data if type(data)==dict: total_per_site[site] += sum( data.values() ) else: total_per_site[site] += data #is the task relevant to recover (discard log, cleanup) if any([v in task.lower() for v in ['logcol','cleanup']]): continue total_count= defaultdict(int) error_site_count = defaultdict( lambda : defaultdict(int)) if not task in err: print task,"has not reported error" err[task] = {} #print err[task].keys() for exittype in err[task]: #print "\t",err[task][exittype].keys() for errorcode_s in err[task][exittype]: if errorcode_s == '0' : continue #print "\t\t",err[task][exittype][errorcode_s].keys() for site in err[task][exittype][errorcode_s]: ce = SI.SE_to_CE(site) count = err[task][exittype][errorcode_s][site]['errorCount'] total_count[errorcode_s] += count #error_site_count[errorcode_s][site] += count error_site_count[errorcode_s][ce] += count for sample in err[task][exittype][errorcode_s][site]['samples']: #print sample.keys() for step in sample['errors']: for report in sample['errors'][step]: if report['type'] == 'CMSExeption': continue #if int(report['exitCode']) == int(errorcode_s): one_explanation[errorcode_s].add("%s (Exit code: %s) \n%s"%(report['type'], report['exitCode'], report['details'])) #one_explanation[errorcode_s].add( report['details'] ) #else: #one_explanation[ agent = sample['agent_name'] wmbs = sample['wmbsid'] workflow = sample['workflow'] if do_CL and ((errorcode_s in expose_condor_code and expose_condor_code[errorcode_s][agent]) or do_all_error_code) and 'cern' in agent: os.system('ssh %s %s/WmAgentScripts/Unified/exec_expose.sh %s %s %s %s %s %s'%( agent, base_dir, workflow, wmbs, errorcode_s, base_dir, monitor_dir, task_short)) if errorcode_s in expose_condor_code: expose_condor_code[errorcode_s][agent]-=1 for out in sample['output']: #print out if out['type'] == 'logArchive': if do_JL and ((errorcode_s in expose_archive_code and expose_archive_code[errorcode_s][agent]) or (do_all_error_code)): if errorcode_s in expose_archive_code: expose_archive_code[errorcode_s][agent]-=1 os.system('mkdir -p /tmp/%s'%(os.getenv('USER'))) local = '/tmp/%s/%s'%(os.getenv('USER'),out['lfn'].split('/')[-1]) command = 'xrdcp root://cms-xrd-global.cern.ch/%s %s'%( out['lfn'], local) ## get the file os.system( command ) ## if this actually fail, let's get the file from eos using the new log mapping ## expose the content label=out['lfn'].split('/')[-1].split('.')[0] m_dir = '%s/joblogs/%s/%s/%s/%s'%(monitor_dir, wfn, errorcode_s, task_short, label) os.system('mkdir -p %s'%(m_dir)) os.system('tar zxvf %s -C %s'%(local,m_dir)) ## truncate the content ?? for fn in os.popen('find %s -type f'%(m_dir)).read().split('\n'): if not fn: continue if any([p in fn for p in ['stdout.log']]): trunc = '/tmp/%s/%s'%(os.getenv('USER'), label) #print fn #print trunc head = tail = 1000 os.system('(head -%d ; echo;echo;echo "<snip>";echo;echo ; tail -%d ) < %s > %s'%(head, tail, fn, trunc)) os.system('mv %s %s'%(trunc, fn)) #print task #print json.dumps( total_count, indent=2) #print json.dumps( explanations , indent=2) all_sites = set() all_codes = set() for code in error_site_count: for site in error_site_count[code]: all_sites.add( site ) if code != '0': all_codes.add( code) ## parse the dashboard data for site in total_by_site_dash: ## no. cannot discriminate by task in dashboard... #all_sites.add( site ) pass ## parse the acdc data notreported='NotReported' all_missing_stats = set() for site in missing_to_run_at[task]: if not missing_to_run_at[task][site]: continue ce = SI.SE_to_CE( site ) #all_sites.add( ce ) all_missing_stats.add( ce ) error_site_count[notreported][ce] = 0 all_codes.add(notreported) ## no error code at that point all_missing_stats = all_missing_stats &set(SI.all_sites) all_not_reported = all_missing_stats - all_sites #print task #print "site with no report",sorted(all_not_reported) #print sorted(all_sites) #print sorted(all_missing_stats) all_sites = all_missing_stats | all_sites all_sites = all_sites & set(SI.all_sites) #success = total_count['0'] #total_jobs = sum(total_count.values()) #print total_jobs,"jobs in total,",success,"successes" #miss = "{:,}".format(missing_to_run[task]) if task in missing_to_run else "N/A" ## show the total s_per_code =defaultdict(int) for site in all_sites: for code in sorted(all_codes): s_per_code[code] += error_site_count[code][site] #no_error = (sum(s_per_code.values())==0) no_error = len(all_not_reported)!=0 if not no_error and notreported in all_codes: all_codes.remove( notreported ) missing_events = missing_to_run[task] if task in missing_to_run else 0 html += "<b>%s</b>"%task.split('/')[-1] if missing_events: html += " is missing <b>%s events</b>"%( "{:,}".format(missing_events) ) if no_error: html +="<br><b><font color=red> and has UNreported error</font></b>" html += "<br><table border=1><thead><tr><th>Sites/Errors</th>" #for site in all_sites: # html+='<th>%s</th>'%site for code in sorted(all_codes): html+='<th><a href="#%s">%s</a>'%(code,code) if str(code) in expose_archive_code or do_all_error_code: html += ' <a href=../joblogs/%s/%s/%s>, JL</a>'%( wfn, code, task_short ) if str(code) in expose_condor_code or do_all_error_code: html += ' <a href=../condorlogs/%s/%s/%s>, CL</a>'%( wfn, code, task_short ) html += '</th>' html+='<th>Total jobs</th><th>Site Ready</th>' html+='</tr></thead>\n' html+='<tr><td>Total</td>' for code in sorted(all_codes): html += '<td bgcolor=orange width=100>%d'%(s_per_code[code]) if code in total_by_code_dash: html += ' (<b><i>%d</i></b>)'% total_by_code_dash[code] html += '</td>' ulist='<ul>' grand=0 for status in sorted(status_per_task[task].keys()): ulist+='<li> %s %d'%( status, status_per_task[task][status]) grand+= status_per_task[task][status] ulist+='<li><b> Total %d </b>'%grand ulist+='</ul>' #html += '<td bgcolor=orange> %.2f%% </td>'% (100.*(float(sum(s_per_code.values()))/sum(total_per_site.values())) if sum(total_per_site.values()) else 0.) html += '<td bgcolor=orange> → %.2f%% ← </td>'% (100.*(float(sum(s_per_code.values()))/ grand) if grand else 0.) html += '<td bgcolor=orange> %s </td>'% ulist html+='</tr>' def palette(frac): _range = { 0.0 : 'green', 0.5 : 'green', 0.6 : 'darkgreen', 0.7 : 'orange', 0.8 : 'salmon', 0.9 : 'red' } which = [k for k in _range.keys() if k<=frac] if which: there = max(which) else: there=max(_range.keys()) return _range[there] for site in sorted(all_sites): site_in = 'Yes' color = 'bgcolor=lightblue' if not site in SI.sites_ready: color = 'bgcolor=indianred' site_in ='<b>No</b>' if missing_to_run_at[task][SI.CE_to_SE(site)] == 0 or min_rank == task_rank: color = 'bgcolor=aquamarine' site_in = '<b>No</b> but fine' if not no_error: site_in +=" (%s events)"%"{:,}".format(missing_to_run_at[task][SI.CE_to_SE(site)]) html+='<tr><td %s>%s</td>'%(color,site) for code in sorted(all_codes): if code == notreported: html += '<td %s width=200>%s events </td>' %(color, "{:,}".format(missing_to_run_at[task][SI.CE_to_SE(site)])) else: if error_site_count[code][site]: er_frac = float(error_site_count[code][site])/s_per_code[code] if s_per_code[code] else 0. si_frac = float(error_site_count[code][site])/total_per_site[site] if total_per_site[site] else 0. html += '<td %s width=200>%d'%(color, error_site_count[code][site]) if code in r_dashb and site in r_dashb[code]: html += ' (<b><i>%d</i></b>)'%( r_dashb[code][site] ) html += ', <font color=%s>↑ %.1f%%</font>, <font color=%s>→ %.1f%%</font></td>'% ( palette(er_frac),100.*er_frac, palette(si_frac), 100.*si_frac ) else: html += '<td %s>0</td>'% color html += '<td bgcolor=orange>%d</td>'% total_per_site[site] html += '<td %s>%s</td>'% (color, site_in) html +='</tr>\n' html+='</table><br>' task_error_site_count[task] = error_site_count html += '<hr><br>' html += "<b>Blocks (%d/%d) needed for recovery</b><br>"%( len(needed_blocks), len(all_blocks)) for block in sorted(needed_blocks): html +='%s<br>'%block html += "<br><b>Files in no block</b><br>" for f in sorted(files_notin_dbs): html +='%s<br>'%f html += '<hr><br>' html += '<table border=1>' for code in one_explanation: html +='<tr><td><a name="%s">%s</a></td><td>%s</td></tr>'% ( code, code, '<br><br>'.join(one_explanation[code]).replace('\n','<br>' )) #explanations[code].update( one_explanation[code] ) html+='</table>' html+=('<br>'*30) html +='</html>' wfi.sendLog( 'error', html, show=False) fn = '%s'% wfn open('%s/report/%s'%(monitor_dir,fn),'w').write( html ) return task_error_site_count, one_explanation
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read()) aaa_mapping = json.loads( open('%s/equalizor.json' % monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read())) all_stuck.update(getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') ##order by priority instead of random if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key=lambda r: r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank(wfn): return cache.index(wfn) if wfn in cache else 0 wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True) print "10 first", [wfo.name for wfo in wfos[:10]] print "10 last", [wfo.name for wfo in wfos[-10:]] else: random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" if options.partial: options_text += ", partial option is ON" options_text += ", good fraction is %.2f" % options.good_enough wfh.sendLog('assignor', "%s to be assigned%s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys())))) sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = wfh.getBlockWhiteList() rwl = wfh.getRunWhiteList() if rwl: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=rwl))) lwl = wfh.getLumiWhiteList() if lwl: ## augment with lumi white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=lwl))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog( 'assignor', "Overiding partial copy assignment to %.2f fraction" % do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] if secondary_aaa: if not one_secondary_locations: sec_availability = getDatasetBlocksFraction(url, sec) if sec_availability >= 1. and options.go: ## there is at least one copy of each block on disk. We should go ahead and let it go. wfh.sendLog( 'assignor', "The secondary %s is available %s times on disk, and usable" % (sec, sec_availability)) else: ## not even a copy on disk anywhere !!!! sites_allowed = [] ## will block the assignment wfh.sendLog( 'assignor', "The secondary %s is nowhere on disk" % sec) #just continue without checking continue #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From/after secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) if primary_aaa: available_fractions[prim] = getDatasetBlocksFraction( url, prim, only_blocks=blocks) sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] if primary_aaa: sites_all_data = list( set([ SI.SE_to_CE(psite) for (psite, (there, frac)) in presence.items() if there ])) sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] if primary_aaa: sites_with_any_data = list( set([SI.SE_to_CE(psite) for psite in presence.keys()])) wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite for osite in opportunistic_sites if osite in SI.sites_not_ready ])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if not isStoreResults: sites_allowed = sites_with_any_data else: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue available_fractions = {} wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints", sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled += 1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog( 'assignor', "The workflow can run at %s under low pressure currently" % (','.join(allowed_and_low))) copies_wanted = max(1., copies_wanted - 1.) if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) above_good = all([ available >= do_partial for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( 'assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay') n_stalled += 1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not ( do_partial and above_good): wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled += 1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) if isHEPCloudReady(url) and wfh.isGoodForNERSC(): parameters['Team'] = 'hepcloud' parameters['SiteWhitelist'] = ['T3_US_NERSC'] if primary: parameters['TrustSitelists'] = True if secondary: parameters['TrustPUSitelists'] = True sendEmail("sending work to hepcloud", "pleasse check on %s" % wfh.request['RequestName'], destination=['*****@*****.**']) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def mappor(url, options=None): up = componentInfo(soft=['mcm', 'wtc', 'jira']) ## define regionality site => fallback allowed. feed on an ssb metric ?? mapping = defaultdict(list) reversed_mapping = defaultdict(list) regions = defaultdict(list) UC = unifiedConfiguration() over_rides = [] use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow")) if options.t0: use_T0 = True if use_T0: over_rides.append('T0_CH_CERN') use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow")) if options.hlt: use_HLT = True if use_HLT: over_rides.append('T2_CH_CERN_HLT') use_CSCS = ('T0_CH_CSCS_HPC' in UC.get("site_for_overflow")) if options.cscs: use_CSCS = True if use_CSCS: over_rides.append('T0_CH_CSCS_HPC') SI = global_SI(over_rides) #print sorted(SI.all_sites) #print sorted(SI.sites_T0s) CI = campaignInfo() #sites_to_consider = SI.all_sites sites_to_consider = SI.sites_ready for site in sites_to_consider: region = site.split('_')[1] if not region in [ 'US', 'DE', 'IT', 'FR', 'CH', 'ES', 'UK', 'RU' ### latest addition ]: continue regions[region] = [region] def site_in_depletion(s): return True if s in SI.sites_pressure: (m, r, pressure) = SI.sites_pressure[s] if float(m) < float(r): print s, m, r, "lacking pressure" return True else: print s, m, r, "pressure" pass return False for site in sites_to_consider: region = site.split('_')[1] ## fallback to the region, to site with on-going low pressure within_region = [ fb for fb in sites_to_consider if any([('_%s_' % (reg) in fb and fb != site and site_in_depletion(fb)) for reg in regions[region]]) ] #print site,region, within_region mapping[site] = within_region for site in sites_to_consider: if site.split('_')[1] == 'US': ## to all site in the US ## add NERSC mapping[site].append('T3_US_NERSC') mapping[site].append('T3_US_SDSC') mapping[site].append('T3_US_TACC') mapping[site].append('T3_US_PSC') ## add OSG mapping[site].append('T3_US_OSG') #mapping[site].append('T3_US_Colorado') pass if use_HLT: mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT') if use_T0: ## who can read from T0 mapping['T2_CH_CERN'].append('T0_CH_CERN') mapping['T1_IT_CNAF'].append('T0_CH_CERN') mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN') mapping['T1_DE_KIT'].append('T0_CH_CERN') if use_CSCS: ## analog config to T0: mapping['T2_CH_CERN'].append('T0_CH_CSCS_HPC') mapping['T1_IT_CNAF'].append('T0_CH_CSCS_HPC') mapping['T1_FR_CCIN2P3'].append('T0_CH_CSCS_HPC') mapping['T1_DE_KIT'].append('T0_CH_CSCS_HPC') ## temptatively mapping['T0_CH_CERN'].append('T2_CH_CERN') ## all europ can read from CERN for reg in ['IT', 'DE', 'UK', 'FR', 'BE', 'ES']: mapping['T2_CH_CERN'].extend( [fb for fb in sites_to_consider if '_%s_' % reg in fb]) pass ## all europ T1 among each others europ_t1 = [ site for site in sites_to_consider if site.startswith('T1') and any([reg in site for reg in ['IT', 'DE', 'UK', 'FR', 'ES', 'RU']]) ] #print europ_t1 for one in europ_t1: for two in europ_t1: if one == two: continue mapping[one].append(two) pass ## all EU T1 can read from T0 mapping['T0_CH_CERN'].append(one) mapping['T0_CH_CERN'].append('T1_US_FNAL') #mapping['T1_IT_CNAF'].append( 'T1_US_FNAL' ) #mapping['T1_IT_CNAF'].extend( [site for site in SI.sites_ready if '_US_' in site] ) ## all US can read from CNAF mapping['T1_IT_CNAF'].append('T2_CH_CERN') mapping['T1_DE_KIT'].append('T2_CH_CERN') mapping['T2_CH_CERN'].append('T1_IT_CNAF') mapping['T2_CH_CERN'].append('T1_US_FNAL') mapping['T2_CH_CERN'].append('T3_CH_CERN_HelixNebula') mapping['T2_CH_CERN'].append('T3_CH_CERN_HelixNebula_REHA') for site in sites_to_consider: if '_US_' in site: mapping[site].append('T2_CH_CERN') ## make them appear as OK to use force_sites = [] ## overflow CERN to underutilized T1s upcoming = json.loads(eosRead('%s/GQ.json' % monitor_dir)) for possible in SI.sites_T1s: if not possible in upcoming: mapping['T2_CH_CERN'].append(possible) pass take_site_out = UC.get('site_out_of_overflow') for site, fallbacks in mapping.items(): mapping[site] = list(set(fallbacks)) ### mapping is a dictionnary where # key can read from site in values. ### reverserd mapping is a dictionnary where # key can be read by site in values. ## create the reverse mapping for the condor module for site, fallbacks in mapping.items(): if site in take_site_out: print "taking", site, "out of overflow source by unified configuration" mapping.pop(site) continue for fb in fallbacks: if fb == site: ## remove self mapping[site].remove(fb) continue if fb in take_site_out: ## remove those to be removed print "taking", fb, "out of overflow destination by unified configuration" mapping[site].remove(fb) continue if not site in reversed_mapping[fb]: reversed_mapping[fb].append(site) ## write it out and bail cache = cacheInfo() cache.store('overflow_mapping', mapping) cache.store('overflow_reverse_mapping', reversed_mapping) return
def spawn_harvesting(url, wfi , in_full): #SI = siteInfo() SI = global_SI() all_OK = {} requests = [] outputs = wfi.request['OutputDatasets'] if ('EnableHarvesting' in wfi.request and wfi.request['EnableHarvesting']) or ('DQMConfigCacheID' in wfi.request and wfi.request['DQMConfigCacheID']): if not 'MergedLFNBase' in wfi.request: print "f****d up" sendEmail('screwed up wl cache','%s wl cache is bad'%(wfi.request['RequestName'])) all_OK['fake'] = False return all_OK,requests wfi = workflowInfo(url, wfi.request['RequestName']) dqms = [out for out in outputs if '/DQM' in out] if not all([in_full[dqm_input] for dqm_input in dqms]): wfi.sendLog('closor',"will not be able to assign the harvesting: holding up") for dqm_input in dqms: all_OK[dqm_input] = False ## raise the subscription to high priority sites = set(wfi.request['NonCustodialSites']) for site in sites: res = updateSubscription(url, site, dqm_input, priority='high') print "increased priority",res return all_OK,requests for dqm_input in dqms: ## handle it properly harvesting_schema = { 'Requestor': os.getenv('USER'), 'RequestType' : 'DQMHarvest', 'Group' : 'DATAOPS' } copy_over = [ 'AcquisitionEra', 'ProcessingString', 'DQMUploadUrl', 'CMSSWVersion', 'CouchDBName', 'CouchWorkloadDBName', 'CouchURL', 'DbsUrl', 'inputMode', 'DQMConfigCacheID', 'OpenRunningTimeout', 'ScramArch', 'CMSSWVersion', 'Campaign', 'Memory', #dummy 'SizePerEvent', #dummy 'GlobalTag', #dummy ] for item in copy_over: if item in wfi.request: harvesting_schema[item] = copy.deepcopy(wfi.request[item]) else: print item,"is not in initial schema" harvesting_schema['InputDataset'] = dqm_input harvesting_schema['TimePerEvent'] = 1 harvesting_schema['PrepID'] = 'Harvest-'+wfi.request['PrepID'] if len(wfi.request['RequestString'])>60: wfi.request['RequestString']= wfi.request['RequestString'][:60] print "truncating request string",wfi.request['RequestString'] harvesting_schema['RequestString'] = 'HARVEST-'+wfi.request['RequestString'] harvesting_schema['DQMHarvestUnit'] = 'byRun' harvesting_schema['ConfigCacheUrl'] = harvesting_schema['CouchURL'] ## uhm, how stupid is that ? harvesting_schema['RequestPriority'] = wfi.request['RequestPriority']*10 harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema) if not harvest_request: print "Error in making harvesting for",wfi.request['RequestName'] print "schema" print json.dumps( harvesting_schema, indent = 2) harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema) if not harvest_request: print "Error twice in harvesting for",wfi.request['RequestName'] print "schema" print json.dumps( harvesting_schema, indent = 2) if harvest_request: requests.append( harvest_request ) ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely data = reqMgrClient.setWorkflowApproved(url, harvest_request) print "created",harvest_request,"for harvesting of",dqm_input wfi.sendLog('closor',"created %s for harvesting of %s"%( harvest_request, dqm_input)) ## assign it directly team = wfi.request['Teams'][0] parameters={ 'SiteWhitelist' : [SI.SE_to_CE(se) for se in wfi.request['NonCustodialSites']], 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], 'execute' : True } if in_full[dqm_input]: print "using full copy at",in_full[dqm_input] parameters['SiteWhitelist'] = [SI.SE_to_CE(se) for se in in_full[dqm_input]] else: print "cannot do anything if not having a full copy somewhere" all_OK[dqm_input]=False continue result = reqMgrClient.assignWorkflow(url, harvest_request, team, parameters) if not result: #sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog('closor','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName'])) sendLog('closor','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), level='critical') else: #sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog('closor','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName'])) else: #print "could not make the harvesting for",wfo.name,"not announcing" wfi.sendLog('closor',"could not make the harvesting request") sendLog('closor',"could not make the harvesting request for %s"% wfi.request['RequestName'], level='critical') all_OK[dqm_input]=False return (all_OK, requests)
def parse_one(url, wfn, options=None): def time_point(label="", sub_lap=False): now = time.mktime(time.gmtime()) nows = time.asctime(time.gmtime()) print "[showError] Time check (%s) point at : %s" % (label, nows) print "[showError] Since start: %s [s]" % (now - time_point.start) if sub_lap: print "[showError] Sub Lap : %s [s]" % (now - time_point.sub_lap) time_point.sub_lap = now else: print "[showError] Lap : %s [s]" % (now - time_point.lap) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime( time.gmtime()) task_error_site_count = {} one_explanation = defaultdict(set) per_task_explanation = defaultdict(set) if wfn in [ 'vlimant_task_EXO-RunIISummer15wmLHEGS-04800__v1_T_170906_141738_1357' ]: return task_error_site_count, one_explanation time_point("Starting with %s" % wfn) threads = [] SI = global_SI() UC = unifiedConfiguration() wfi = workflowInfo(url, wfn) time_point("wfi", sub_lap=True) where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo() time_point("acdcinfo", sub_lap=True) all_blocks, needed_blocks_loc, files_in_blocks, files_and_loc_notin_dbs = wfi.getRecoveryBlocks( ) time_point("inputs", sub_lap=True) ancestor = workflowInfo(url, wfn) lhe, prim, _, sec = ancestor.getIO() high_order_acdc = 0 while ancestor.request['RequestType'] == 'Resubmission': ancestor = workflowInfo(url, ancestor.request['OriginalRequestName']) lhe, prim, _, sec = ancestor.getIO() high_order_acdc += 1 no_input = (not lhe) and len(prim) == 0 and len(sec) == 0 cache = options.cache print "cache timeout", cache err = wfi.getWMErrors(cache=cache) time_point("wmerrors", sub_lap=True) stat = wfi.getWMStats(cache=cache) time_point("wmstats", sub_lap=True) #adcd = wfi.getRecoveryDoc() total_by_code_dash = defaultdict(int) total_by_site_dash = defaultdict(int) r_dashb = defaultdict(lambda: defaultdict(int)) dash_board_h = 1 if False: ## NB get the since from when the wf has started, not a fixed value ## no dashboard until we get a better api #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache) dashb = {} #print json.dumps( dashb , indent=2) for site, sinfo in dashb.items(): for s_code, counts in sinfo.items(): d_statuses = ['submitted', 'pending', 'app-unknown', 'done'] total_by_code_dash[str(s_code)] += counts.get('submitted', 0) total_by_site_dash[site] += counts.get('submitted', 0) r_dashb[str(s_code)][site] += counts.get('submitted', 0) print json.dumps(total_by_code_dash, indent=2) print json.dumps(total_by_site_dash, indent=2) time_point("Got most input") status_per_task = defaultdict(lambda: defaultdict(int)) if not 'AgentJobInfo' in stat: stat['AgentJobInfo'] = {} #print "bad countent ?" #print json.dumps( stat, indent=2) for agent in stat['AgentJobInfo']: for task in stat['AgentJobInfo'][agent]['tasks']: if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]: continue for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']: info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][ status] #print status,stat['AgentJobInfo'][agent]['tasks'][task]['status'][status] if type(info) == dict: status_per_task[task][status] += sum( stat['AgentJobInfo'][agent]['tasks'][task]['status'] [status].values()) else: status_per_task[task][status] += stat['AgentJobInfo'][ agent]['tasks'][task]['status'][status] #print json.dumps( status_per_task, indent=2) db_total_per_site = defaultdict(int) db_total_per_code = defaultdict(int) ## cannot do that since there is no task count in dashboard and we have to take away the submitted #for site in dashb: # for error in dashb[site]: # db_total_per_site[site] += dashb[site][error] # db_total_per_code[code] += dashb[site][error] print "ACDC Information" print "\t where to re-run" print json.dumps(where_to_run, indent=2) print "\t Missing events" print json.dumps(missing_to_run, indent=2) print "\t Missing events per site" print json.dumps(missing_to_run_at, indent=2) if not where_to_run and not missing_to_run and not missing_to_run_at: print "showError is unable to run" #return task_error_site_count, one_explanation pass do_JL = not options.no_JL do_CL = not options.no_CL do_all_error_code = options.all_errors if high_order_acdc >= 1: print high_order_acdc, "order request, pulling down all logs" do_all_error_code = True if wfi.isRelval(): print "getting all codes for relval" do_all_error_code = True tasks = sorted(set(err.keys() + missing_to_run.keys())) if not tasks: print "no task to look at" #return task_error_site_count html = "<html> <center><h1>%s, Updated on %s (GMT)" % ( wfn, time.asctime(time.gmtime())) html += '</center>' html += '<a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>dts</a>, ' % ( wfn) html += '<a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>ac</a>, ' % ( wfi.request['PrepID']) html += '<a href=https://cms-gwmsmon.cern.ch/prodview/%s>Job Progress</a>, ' % ( wfn) r_type = wfi.request.get('OriginalRequestType', wfi.request.get('RequestType', 'NaT')) if r_type in ['ReReco']: html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a>, ' % wfi.request[ 'PrepID'] html += '<a href="https://its.cern.ch/jira/issues/?jql=text~%s AND project = CMSCOMPPR" target="_blank">jira</a>' % ( wfi.request['PrepID']) html += '<hr>' html += '<a href=#IO>I/O</a>, <a href=#ERROR>Errors</a>, <a href=#BLOCK>blocks</a>, <a href=#FILE>files</a>, <a href=#CODES>Error codes</a><br>' html += '<hr>' time_point("Header writen") html += '<a name=IO></a>' if prim: html += 'Reads in primary<br>' rwl = wfi.getRunWhiteList() lwl = wfi.getLumiWhiteList() for dataset in prim: html += '<b>%s </b>(events/lumi ~%d)' % ( dataset, getDatasetEventsPerLumi(dataset)) blocks = getDatasetBlocks(dataset, runs=rwl) if rwl else None blocks = getDatasetBlocks(dataset, lumis=lwl) if lwl else None available = getDatasetBlocksFraction(url, dataset, only_blocks=blocks) html += '<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>' % available html += '<ul>' presence = getDatasetPresence(url, dataset, only_blocks=blocks) for site in sorted(presence.keys()): html += '<li>%s : %.2f %%' % (site, presence[site][1]) html += '</ul><br>' if sec: html += 'Reads in secondary<br>' for dataset in sec: presence = getDatasetPresence(url, dataset) html += '<b>%s</b><ul>' % dataset for site in sorted(presence.keys()): html += '<li>%s : %.2f %%' % (site, presence[site][1]) html += '</ul>' outs = sorted(wfi.request['OutputDatasets']) if outs: html += 'Produces<br>' for dataset in outs: presence = getDatasetPresence(url, dataset) html += '<b>%s </b>(events/lumi ~ %d)<ul>' % ( dataset, getDatasetEventsPerLumi(dataset)) for site in sorted(presence.keys()): html += '<li>%s : %.2f %%' % (site, presence[site][1]) html += '</ul>' time_point("Input checked") html += """ <hr><br> <a name=ERROR></a> <ul> <li> <b><i>dashboard numbers over %d days</b></i> <li> ↑ %% with respect to total number of error in the code <li> → %% with respect to total number of error at the site </ul> """ % (dash_board_h) html += '<br>' n_expose_base = options.expose # if options else UC.get('n_error_exposed') print "getting", n_expose_base, "logs by default" if tasks: min_rank = min([task.count('/') for task in tasks]) for task in tasks: n_expose = n_expose_base expose_archive_code = dict([(str(code), defaultdict(lambda: n_expose)) for code in UC.get('expose_archive_code')]) expose_condor_code = dict([(str(code), defaultdict(lambda: n_expose)) for code in UC.get('expose_condor_code')]) #print task task_rank = task.count('/') task_short = task.split('/')[-1] total_per_site = defaultdict(int) time_point("Starting with task %s" % task_short, sub_lap=True) notreported = 'NotReported' total_count = defaultdict(int) error_site_count = defaultdict(lambda: defaultdict(int)) all_not_reported = set() for agent in stat['AgentJobInfo']: for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get( 'skipped', {}): info = stat['AgentJobInfo'][agent]['tasks'][task]['skipped'][ site] #print info all_not_reported.add(site) ce = SI.SE_to_CE(site) error_site_count[notreported][ce] += info.get( 'skippedFiles', 0) total_count[notreported] += info.get('skippedFiles', 0) for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get( 'sites', {}): info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][ site] for s in ['success', 'failure', 'cooloff', 'submitted']: if not s in info: continue data = info[s] if type(data) == dict: total_per_site[site] += sum(data.values()) else: total_per_site[site] += data #is the task relevant to recover (discard log, cleanup) if any([v in task.lower() for v in ['logcol', 'cleanup']]): continue #total_count= defaultdict(int) #error_site_count = defaultdict( lambda : defaultdict(int)) if not task in err: print task, "has not reported error" err[task] = {} #print err[task].keys() for exittype in err[task]: #print "\t",err[task][exittype].keys() for errorcode_s in err[task][exittype]: if errorcode_s == '0': continue #print "\t\t",err[task][exittype][errorcode_s].keys() for site in err[task][exittype][errorcode_s]: ce = SI.SE_to_CE(site) count = err[task][exittype][errorcode_s][site][ 'errorCount'] total_count[errorcode_s] += count #error_site_count[errorcode_s][site] += count error_site_count[errorcode_s][ce] += count ## show the total all_sites = set() all_codes = set() for code in error_site_count: for site in error_site_count[code]: all_sites.add(site) if code != '0': all_codes.add(code) s_per_code = defaultdict(int) for site in all_sites: for code in sorted(all_codes): s_per_code[code] += error_site_count[code][site] expose_top_N = UC.get('expose_top_N') count_top_N = min( sorted(s_per_code.values(), reverse=True)[:expose_top_N]) if s_per_code else -1 for exittype in err[task]: #print "\t",err[task][exittype].keys() for errorcode_s in err[task][exittype]: if errorcode_s == '0': continue #print "\t\t",err[task][exittype][errorcode_s].keys() force_code = (count_top_N > 0 and s_per_code[errorcode_s] >= count_top_N) if force_code: print "will expose", errorcode_s, "anyways" for site in err[task][exittype][errorcode_s]: ce = SI.SE_to_CE(site) count = err[task][exittype][errorcode_s][site][ 'errorCount'] ###total_count[errorcode_s] += count #error_site_count[errorcode_s][site] += count ###error_site_count[errorcode_s][ce] += count for sample in err[task][exittype][errorcode_s][site][ 'samples']: #print sample.keys() for step in sample['errors']: for report in sample['errors'][step]: if report['type'] == 'CMSExeption': continue #if int(report['exitCode']) == int(errorcode_s): one_explanation[errorcode_s].add( "%s (Exit code: %s) \n%s" % (report['type'], report['exitCode'], report['details'])) per_task_explanation[ "%s:%s" % (task_short, errorcode_s)].add( "%s (Exit code: %s) \n%s" % (report['type'], report['exitCode'], report['details'])) #one_explanation[errorcode_s].add( report['details'] ) #else: #one_explanation[ agent = sample['agent_name'] wmbs = sample['wmbsid'] workflow = sample['workflow'] if force_code: if not errorcode_s in expose_condor_code: expose_condor_code[errorcode_s] = defaultdict( lambda: n_expose) if not errorcode_s in expose_archive_code: expose_archive_code[errorcode_s] = defaultdict( lambda: n_expose) if do_CL and ((errorcode_s in expose_condor_code and expose_condor_code[errorcode_s][agent]) ) and 'cern' in agent: if errorcode_s in expose_condor_code: expose_condor_code[errorcode_s][agent] -= 1 print errorcode_s, agent, "error count", expose_condor_code.get( errorcode_s, {}).get(agent, 0) threads.append( AgentBuster(agent=agent, workflow=workflow, wmbs=wmbs, errorcode_s=errorcode_s, base_eos_dir=base_eos_dir, monitor_eos_dir=monitor_eos_dir, task_short=task_short)) for out in sample['output']: #print out if out['type'] == 'logArchive': if do_JL and ( (errorcode_s in expose_archive_code and expose_archive_code[errorcode_s][agent] > 0)): if errorcode_s in expose_archive_code: expose_archive_code[errorcode_s][ agent] -= 1 print errorcode_s, agent, "error count", expose_archive_code.get( errorcode_s, {}).get(agent, 0) threads.append( XRDBuster( out_lfn=out['lfn'], monitor_eos_dir=monitor_eos_dir, wfn=wfn, errorcode_s=errorcode_s, task_short=task_short, from_eos=( not options.not_from_eos ), # if options else True), )) #print task #print json.dumps( total_count, indent=2) #print json.dumps( explanations , indent=2) all_sites = set() all_codes = set() for code in error_site_count: for site in error_site_count[code]: all_sites.add(site) if code != '0': all_codes.add(code) ## parse the dashboard data for site in total_by_site_dash: ## no. cannot discriminate by task in dashboard... #all_sites.add( site ) pass ## parse the acdc data #notreported='NotReported' #all_missing_stats = set() #for site in missing_to_run_at[task] if task in missing_to_run_at else []: # if not missing_to_run_at[task][site]: continue # ce = SI.SE_to_CE( site ) # #all_sites.add( ce ) # all_missing_stats.add( ce ) #all_missing_stats = all_missing_stats &set(SI.all_sites) #all_not_reported = all_missing_stats - all_sites #print task #print "site with no report",sorted(all_not_reported) #print sorted(all_sites) #print sorted(all_missing_stats) #all_sites = all_missing_stats | all_sites #all_sites = all_sites & set(SI.all_sites) no_error = len(all_not_reported) != 0 if not no_error and notreported in all_codes: all_codes.remove(notreported) missing_events = missing_to_run[task] if task in missing_to_run else 0 feff = wfi.getFilterEfficiency(task.split('/')[-1]) html += "<a name=%s>" % task.split('/')[-1] html += "<b>%s</b>" % task.split('/')[-1] if missing_events: if feff != 1.: html += ' is missing %s events in input and <b>about %s events in output</b>' % ( "{:,}".format(missing_events), "{:,}".format( int(missing_events * feff))) else: html += ' is missing <b>%s events in I/O</b>' % ( "{:,}".format(missing_events)) html += ' <a href="https://cmsweb.cern.ch/couchdb/acdcserver/_design/ACDC/_view/byCollectionName?key=%%22%s%%22&include_docs=true&reduce=false" target=_blank>AC/DC</a>' % ( wfn) if no_error: html += "<br><b><font color=red> and has UNreported error</font></b>" html += "<br><table border=1><thead><tr><th>Sites/Errors</th>" #for site in all_sites: # html+='<th>%s</th>'%site for code in sorted(all_codes): #html+='<th><a href="#%s">%s</a>'%(code,code) html += '<th><a href="#%s:%s">%s</a>' % (task_short, code, code) if (str(code) in expose_archive_code or do_all_error_code): # and n_expose_base: html += ' <a href=%s/joblogs/%s/%s/%s>, JobLog</a>' % ( unified_url_eos, wfn, code, task_short) if (str(code) in expose_condor_code or do_all_error_code): # and n_expose_base: html += ' <a href=%s/condorlogs/%s/%s/%s>, CondorLog</a>' % ( unified_url_eos, wfn, code, task_short) html += '</th>' html += '<th>Total jobs</th><th>Site Ready</th>' html += '</tr></thead>\n' html += '<tr><td>Total</td>' for code in sorted(all_codes): html += '<td bgcolor=orange width=100>%d' % (s_per_code[code]) if code in total_by_code_dash: html += ' (<b><i>%d</i></b>)' % total_by_code_dash[code] html += '</td>' ulist = '<ul>' grand = 0 for status in sorted(status_per_task[task].keys()): ulist += '<li> %s %d' % (status, status_per_task[task][status]) grand += status_per_task[task][status] ulist += '<li><b> Total %d </b>' % grand ulist += '</ul>' #html += '<td bgcolor=orange> %.2f%% </td>'% (100.*(float(sum(s_per_code.values()))/sum(total_per_site.values())) if sum(total_per_site.values()) else 0.) html += '<td bgcolor=orange> → %.2f%% ← </td>' % ( 100. * (float(sum(s_per_code.values())) / grand) if grand else 0.) html += '<td bgcolor=orange> %s </td>' % ulist html += '</tr>' def palette(frac): _range = { 0.0: 'green', 0.5: 'green', 0.6: 'darkgreen', 0.7: 'orange', 0.8: 'salmon', 0.9: 'red' } which = [k for k in _range.keys() if k <= frac] if which: there = max(which) else: there = max(_range.keys()) return _range[there] for site in sorted(all_sites): site_in = 'Yes' color = 'bgcolor=lightblue' if not site in SI.sites_ready: color = 'bgcolor=indianred' site_in = '<b>No</b>' if task in missing_to_run_at and missing_to_run_at[task][ SI.CE_to_SE(site)] == 0 or min_rank == task_rank: color = 'bgcolor=aquamarine' site_in = '<b>No</b> but fine' if not no_error: site_in += " (%s events)" % ("{:,}".format( missing_to_run_at[task][SI.CE_to_SE(site)]) if task in missing_to_run_at else '--') html += '<tr><td %s>%s</td>' % (color, site) for code in sorted(all_codes): if code == notreported: html += '<td %s width=200>%s events </td>' % ( color, "{:,}".format( missing_to_run_at[task][SI.CE_to_SE(site)])) else: if error_site_count[code][site]: er_frac = float( error_site_count[code][site] ) / s_per_code[code] if s_per_code[code] else 0. si_frac = float( error_site_count[code][site]) / total_per_site[ site] if total_per_site[site] else 0. html += '<td %s width=200>%d' % ( color, error_site_count[code][site]) if code in r_dashb and site in r_dashb[code]: html += ' (<b><i>%d</i></b>)' % ( r_dashb[code][site]) html += ', <font color=%s>↑ %.1f%%</font>, <font color=%s>→ %.1f%%</font></td>' % ( palette(er_frac), 100. * er_frac, palette(si_frac), 100. * si_frac) else: html += '<td %s>0</td>' % color html += '<td bgcolor=orange>%d</td>' % total_per_site[site] html += '<td %s>%s</td>' % (color, site_in) html += '</tr>\n' html += '</table><br>' task_error_site_count[task] = error_site_count ## run all retrieval run_threads = ThreadHandler( threads=threads, n_threads=options.log_threads, # if options else 5, sleepy=10, timeout=UC.get('retrieve_errors_timeout'), verbose=True) run_threads.start() html += '<hr><br>' html += '<a name=BLOCK></a>' html += "<b>Blocks (%d/%d) needed for recovery</b><br>" % ( len(needed_blocks_loc), len(all_blocks)) for block in sorted(needed_blocks_loc.keys()): html += '%s <b>@ %s</b><br>' % (block, ','.join( sorted(needed_blocks_loc[block]))) html += '<a name=FILE></a>' html += "<br><b>Files in no block</b><br>" rthreads = [] check_files = [f for f in files_and_loc_notin_dbs.keys() if '/store' in f] random.shuffle(check_files) check_files = check_files[:100] check_files = [] ## disable it completely by_f = {} if check_files: for f in check_files: rthreads.append(ReadBuster(file=f)) print "checking on existence of", len(rthreads), "files" run_rthreads = ThreadHandler(threads=rthreads, n_threads=20, timeout=10) run_rthreads.start() while run_rthreads.is_alive(): time.sleep(10) for t in run_rthreads.threads: by_f[t.file] = t.readable #print "checked",t.file,t.readable for f in sorted(files_and_loc_notin_dbs.keys()): readable = by_f.get(f, -1) if readable == -1: fs = '%s' % f elif readable == 0: fs = '<font color=light green>%s</font>' % f #print f,"is readable" else: fs = '<font color=red>%s</font>' % f #print f,"is not readable" html += '%s <b>@</b> %s<br>' % (fs, ','.join( sorted(files_and_loc_notin_dbs[f]))) #html +='%s <b>@</b> %s<br>'%(f , ','.join(sorted(files_and_loc_notin_dbs[f])) ) html += '<hr><br>' html += '<a name=CODES></a>' html += '<table border=1>' for code in per_task_explanation: html += '<tr><td><a name="%s">%s</a><br><a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/JobExitCodes>code twiki</a></td><td>%s</td></tr>' % ( code, code, '<br><br>'.join(per_task_explanation[code]).replace( '\n', '<br>')) #for code in one_explanation: # html +='<tr><td><a name="%s">%s</a></td><td>%s</td></tr>'% ( code, code, '<br><br>'.join(one_explanation[code]).replace('\n','<br>' )) html += '</table>' html += ('<br>' * 30) html += '</html>' time_point("Report finished") wfi.sendLog('error', html, show=False) fn = '%s' % wfn time_point("error send to ES") open('%s/report/%s' % (monitor_dir, fn), 'w').write(html) open('%s/report/%s' % (monitor_eos_dir, fn), 'w').write(html) time_point("Finished with showError") ## then wait for the retrivals to complete ping = 0 while run_threads.is_alive(): ping += 1 if ping % 100: time_point("waiting for sub-threads to finish") time.sleep(6) time_point("Finished with retrieval threads") return task_error_site_count, one_explanation