def stagor(url, specific=None, options=None):
    """Drive the 'staging' step of the workflow lifecycle.

    Walks every Workflow row in status 'staging', checks the completion of
    the PhEDEx transfers attached to it (TransferImp rows), and promotes the
    workflow to 'staged' when all of its input datasets are sufficiently
    transferred (or when a reduced-copy/jump-ahead heuristic allows it).
    Workflows whose only incomplete endpoints are in downtime are sent back
    to 'considered'.  Lost blocks/files are detected, alarmed on, and the
    bookkeeping JSON files under monitor_dir / base_eos_dir are refreshed.

    url      -- reqmgr/phedex service base url passed through to helpers
    specific -- when set, per-transfer status checks are skipped entirely
    options  -- unused here; kept for interface uniformity with other modules

    Returns None.  All effects are side effects: DB status updates
    (session.commit), sendLog/sendEmail alarms, and JSON snapshot files.
    """
    # Abort outright if any external component (reqmgr, phedex, ...) is down.
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()
    TS = transferStatuses()
    # Snapshot of transfer statuses persisted by a previous pass; used as a
    # cache to avoid re-querying phedex for transfers already seen.
    cached_transfer_statuses = TS.content()
    transfer_statuses = {}  # NOTE(review): never used below — leftover from the commented-out json dump further down

    done_by_wf_id = {}         # wf.id -> {phedexid: bool done}
    done_by_input = {}         # dataset -> {phedexid: bool done}
    completion_by_input = {}   # dataset -> {phedexid: [completion % per node]}
    good_enough = 100.0        # completion percentage required to call a node "done"

    ## figure out what is lost for real among the previously recorded losses
    lost_blocks = json.loads(
        eosRead('%s/lost_blocks_datasets.json' % monitor_dir))
    lost_files = json.loads(
        eosRead('%s/lost_files_datasets.json' % monitor_dir))
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys() + lost_files.keys()):
        b, f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset, "has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f:
            print dataset, "has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    def time_point(label="", sub_lap=False):
        # Timing helper: prints elapsed time since start and since the last
        # lap / sub-lap.  State is stashed on the function object itself.
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s" % (label, nows)
        print "Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())

    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []                              # [(Workflow row, workflowInfo)] actually in staging
    needs = defaultdict(list)               # wf name -> [input datasets]
    needs_by_priority = defaultdict(list)   # request priority -> [input datasets]
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in [
                'running-open', 'running-closed', 'completed', 'assigned',
                'acquired'
        ]:
            # Already moved on upstream: reflect that locally and skip.
            wfi.sendLog('stagor',
                        "is in status %s" % wfi.request['RequestStatus'])
            wfo.status = 'away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            ## that usually happens for relvals
            if wfi.request['RequestStatus'] in [
                    'rejected', 'aborted', 'aborted-completed',
                    'aborted-archived', 'rejected-archived'
            ] and wfi.isRelval():
                wfo.status = 'forget'
                session.commit()
                continue
            else:
                print wfo.name, "is", wfi.request['RequestStatus']
                #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
                sendLog("stagor",
                        "%s is in %s, set away" %
                        (wfo.name, wfi.request['RequestStatus']),
                        level='critical')
                wfo.status = 'away'
                session.commit()
                continue

        wfois.append((wfo, wfi))
        _, primaries, _, secondaries = wfi.getIO()
        for dataset in list(primaries) + list(secondaries):
            needs[wfo.name].append(dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append(dataset)
            wfi.sendLog('stagor', '%s needs %s' % (wfo.name, dataset))

    time_point("Check staging workflows")

    # Snapshot the requirement maps for monitoring.
    open('%s/dataset_requirements.json' % monitor_dir,
         'w').write(json.dumps(needs, indent=2))
    for prio in needs_by_priority:
        needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    open('%s/dataset_priorities.json' % monitor_dir,
         'w').write(json.dumps(needs_by_priority, indent=2))

    dataset_endpoints = defaultdict(set)     # dataset -> destination SEs seen in checks
    endpoint_in_downtime = defaultdict(set)  # dataset -> incomplete CEs that are in downtime
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)  # dataset -> incomplete CEs
    #endpoint = defaultdict(set)
    send_back_to_considered = set()

    ## first check if anything is inactive
    all_actives = set([
        transfer.phedexid for transfer in session.query(TransferImp).filter(
            TransferImp.active).all()
    ])
    for active_phedexid in all_actives:
        # A transfer stays active only if at least one of its workflows is
        # still in 'staging'.
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip = False
                sendLog(
                    'stagor', "\t%s is staging for %s" %
                    (imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor', "setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(
        set([
            transfer.phedexid for transfer in session.query(
                TransferImp).filter(TransferImp.active).all()
        ]))
    for phedexid in all_actives:
        # NOTE(review): with `specific` set, every transfer check is skipped,
        # leaving done_by_input empty — presumably intentional fast path.
        if specific: continue

        ## check on transfer completion
        not_cached = False  # NOTE(review): never set True below, so the "not cached" print further down is unreachable — verify against upstream history
        if phedexid in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor', "read %s from cache" % phedexid)
            checks = cached_transfer_statuses[phedexid]
        else:
            ## I actually would like to avoid that all I can
            sendLog('stagor',
                    'Performing spurious transfer check on %s' % phedexid,
                    level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            try:
                print json.dumps(checks, indent=2)
            except:
                print checks

            if not checks:
                ## this is going to bias quite heavily the rest of the code. we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog(
                    'stagor',
                    'Stagor has got a skewed input from checkTransferStatus',
                    level='critical')
                checks = {}
                pass
            else:
                TS.add(phedexid, checks)

        time_point("Check transfer status %s" % phedexid, sub_lap=True)

        if not specific:
            # Record per-dataset completion: checks[dsname] maps node -> %.
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname] = {}
                if not dsname in completion_by_input:
                    completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid] = all(
                    map(lambda i: i >= good_enough,
                        checks[dsname].values()))
                completion_by_input[dsname][phedexid] = checks[dsname].values()
        if checks:
            sendLog(
                'stagor', "Checks for %s are %s" %
                (phedexid, [node.values() for node in checks.values()]))
            done = all(
                map(
                    lambda i: i >= good_enough,
                    list(
                        itertools.chain.from_iterable(
                            [node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ", phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        # Propagate the verdict to every workflow hanging off this transfer,
        # and deactivate the transfer rows once done.
        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:  # and tr_wf.status == 'staging':
                if not tr_wf.id in done_by_wf_id:
                    done_by_wf_id[tr_wf.id] = {}
                done_by_wf_id[tr_wf.id][phedexid] = done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s, v in checks[ds].items():
                dataset_endpoints[ds].add(s)

        if done:
            sendLog('stagor', "%s is done" % phedexid)
            TS.add(phedexid, checks)
        else:
            sendLog(
                'stagor',
                "%s is not finished %s" % (phedexid, pprint.pformat(checks)))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [
                    SI.SE_to_CE(s) for s, v in checks[ds].items()
                    if v < good_enough
                ]
                sites_incomplete_down = [
                    s for s in sites_incomplete if not s in SI.sites_ready
                ]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]
                if sites_incomplete_down:
                    sendLog(
                        'stagor',
                        "%s are in downtime, while waiting for %s to get there"
                        % (",".join(sites_incomplete_down), ds))
                    endpoint_in_downtime[ds].update(sites_incomplete_down)
                if sites_incomplete:
                    endpoint_incompleted[ds].update(sites_incomplete)

    time_point("Check on-going transfers")

    # Convert sets to lists for json serialization and dump for monitoring.
    print "End points"
    for k in dataset_endpoints:
        dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps(dataset_endpoints, indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime:
        endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps(endpoint_in_downtime, indent=2)

    print "End point incomplete in down time"
    for k in endpoint_incompleted:
        endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps(endpoint_incompleted, indent=2)

    #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    eosFile('%s/transfer_statuses.json' % monitor_dir,
            'w').write(json.dumps(TS.content(), indent=2)).close()
    eosFile('%s/dataset_endpoints.json' % monitor_dir,
            'w').write(json.dumps(dataset_endpoints, indent=2)).close()

    # Datasets already flagged as stuck by previous passes / other modules.
    already_stuck = json.loads(
        eosRead('%s/stuck_transfers.json' % monitor_pub_dir)).keys()
    already_stuck.extend(getAllStuckDataset())

    missing_in_action = defaultdict(list)   # dataset -> [phedexids not done]

    print "-" * 10, "Checking on workflows in staging", "-" * 10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    # Caches keyed by (dataset, sorted-SE-list) to avoid repeated phedex queries.
    available_cache = defaultdict(lambda: defaultdict(float))
    presence_cache = defaultdict(dict)

    time_point("Preparing for more")
    for wfo, wfi in wfois:
        print "#" * 30
        time_point("Forward checking %s" % wfo.name, sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_, primaries, _, secondaries,
         sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys = {}  # dataset -> bool "fully transferred"
        for need in list(primaries) + list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor',
                            "missing transfer report for %s" % need)
                readys[need] = False
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report',
                          '%s does not have a transfer report' % (need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                # Secondaries (pileup) can legitimately have no transfer.
                wfi.sendLog(
                    'stagor',
                    "assuming it is OK for secondary %s to have no attached transfers"
                    % need)
                readys[need] = True
                done_by_input[need] = {"fake": True}
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor', "%s is ready" % need)
                print json.dumps(done_by_input[need], indent=2)
                readys[need] = True
            else:
                wfi.sendLog(
                    'stagor', "%s is not ready \n%s" %
                    (need, json.dumps(done_by_input[need], indent=2)))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',
                            "all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor', "all needs are fullfilled, already")
                print json.dumps(readys, indent=2)
        else:
            wfi.sendLog('stagor', "missing requirements")
            copies_needed, _ = wfi.getNCopies()
            jump_ahead = False   # set when availability heuristics allow staging anyway
            re_transfer = False  # set when the only missing endpoints are in downtime
            ## there is missing input let's do something more elaborated
            for need in list(primaries):  #+list(secondaries):
                if endpoint_in_downtime[need] and endpoint_in_downtime[
                        need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog(
                        'stagor',
                        "%s has only incomplete endpoint in downtime\n%s" %
                        (need, endpoint_in_downtime[need]))
                    re_transfer = True

                if not se_allowed_key in available_cache[need]:
                    available_cache[need][
                        se_allowed_key] = getDatasetBlocksFraction(
                            url, need, sites=se_allowed)
                if available_cache[need][se_allowed_key] >= copies_needed:
                    wfi.sendLog(
                        'stagor',
                        "assuming it is OK to move on like this already for %s"
                        % need)
                    jump_ahead = True
                else:
                    wfi.sendLog(
                        'stagor', "Available %s times" %
                        available_cache[need][se_allowed_key])
                    missing_and_downtime = list(
                        set(endpoint_in_downtime[need])
                        & set(endpoint_incompleted[need]))
                    if missing_and_downtime:
                        wfi.sendLog(
                            'stagor',
                            "%s is incomplete at %s which is in downtime, trying to move along"
                            % (need, ','.join(missing_and_downtime)))
                        jump_ahead = True
                    else:
                        wfi.sendLog(
                            'stagor',
                            "continue waiting for transfers for optimum production performance."
                        )

            ## compute a time since staging to filter jump starting ?
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries) + list(secondaries):
                if need in already_stuck:
                    wfi.sendLog('stagor',
                                "%s is stuck, so try to jump ahead" % need)
                    jump_ahead = True

            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead" % wfo.name
                details_text += '\n%s wants %s copies' % (wfo.name,
                                                          copies_needed)
                # Relax the requirement by one copy (never below 1).
                copies_needed = max(1, copies_needed - 1)
                details_text += '\nlowering by one unit to %s' % copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True

                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][
                            se_allowed_key] = getDatasetPresence(
                                url, need, within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update(presence.keys())
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog(
                        'stagor', "%s is available %s times (%s), at %s" %
                        (need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secon
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog(
                        'stagor', "%s is this much transfered %s" %
                        (need, json.dumps(done_by_input[need], indent=2)))
                    all_check &= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:
                    wfi.sendLog(
                        'stagor',
                        "needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name, "has to wait a bit more"
                    wfi.sendLog('stagor', "needs to wait a bit more")
            else:
                wfi.sendLog('stagor', "not checking availability")

            if re_transfer:
                wfi.sendLog(
                    'stagor',
                    "Sending back to considered because of endpoint in downtime"
                )
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add(wfo.name)

    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor',
                "sending back to considered the following workflows \n%s" %
                ('\n'.join(send_back_to_considered)),
                level='critical')

    print "-" * 10, "Checking on non-available datasets", "-" * 10
    ## now check on those that are not fully available
    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min(available_cache[dsname].values())

    really_stuck_dataset = set()

    for dsname, available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(
                Workflow.name == using_it).first()
            if wf:
                using_wfos.append(wf)

        if not len(done_by_input[dsname]):
            print "For dataset", dsname, "there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending", wf.name, "back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor',
                                "%s was send back and might be trouble" %
                                wf.name,
                                level='critical')
                    else:
                        print "would send", wf.name, "back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog(
                            'stagor',
                            "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."
                            % wf.name,
                            level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete", dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            # NOTE(review): these locals shadow the module-level lost_blocks /
            # lost_files dicts loaded at the top of the function.
            lost_blocks, lost_files = findLostBlocksFiles(
                url, dsname) if (not dsname.endswith('/RAW')) else ([], [])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss, _, n_missing = getDatasetBlockFraction(
                    dsname, lost_block_names)
                print "We have lost", len(
                    lost_block_names
                ), "blocks", lost_block_names, "for %f%%" % (100. *
                                                             fraction_loss)
                if fraction_loss > 0.05:  ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d blocks, for %d events, %3.2f %% loss'
                        % (dsname, len(lost_block_names), n_missing,
                           100 * fraction_loss),
                        level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog(
                                'stagor',
                                '%s has too much loss on the input dataset %s. Missing %d blocks, for %d events, %3.2f %% loss'
                                % (wf.name, dsname, len(lost_block_names),
                                   n_missing, 100 * fraction_loss),
                                level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog(
                            'stagor',
                            '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_block_names), n_missing,
                               fraction_loss, '\n'.join(lost_block_names)),
                            level='critical')
                        known_lost_blocks[dsname] = [
                            i['name'] for i in lost_blocks
                        ]
                really_stuck_dataset.add(dsname)

            if lost_files:
                fraction_loss, _, n_missing = getDatasetFileFraction(
                    dsname, lost_file_names)
                print "We have lost", len(
                    lost_file_names
                ), "files", lost_file_names, "for %f%%" % fraction_loss
                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d files, for %d events, %f %% loss' %
                        (dsname, len(lost_file_names), n_missing,
                         fraction_loss),
                        level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog(
                            'stagor',
                            '%s is missing %d files, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_file_names), n_missing,
                               fraction_loss, '\n'.join(lost_file_names)),
                            level='critical')
                        known_lost_files[dsname] = [
                            i['name'] for i in lost_files
                        ]

            ## should the status be change to held-staging and pending on a ticket
            missings = [
                pid for (pid, d) in done_by_input[dsname].items()
                if d == False
            ]
            print "\t", done_by_input[dsname]
            print "\tneeds", len(done_by_input[dsname])
            print "\tgot", done_by_input[dsname].values().count(True)
            print "\tmissing", missings
            missing_in_action[dsname].extend(missings)

    # Persist the refreshed loss bookkeeping and the stuck-transfer summary.
    rr = eosFile('%s/lost_blocks_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_blocks, indent=2))
    rr.close()

    rr = eosFile('%s/lost_files_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_files, indent=2))
    rr.close()

    eosFile('%s/incomplete_transfers.json' % monitor_dir,
            'w').write(json.dumps(missing_in_action, indent=2)).close()
    print "Stuck transfers and datasets"
    print json.dumps(missing_in_action, indent=2)

    TD = transferDataset()
    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add(dataset)

    for k in datasets_by_phid:
        #datasets_by_phid[k] = list(datasets_by_phid[k])
        TD.add(k, list(datasets_by_phid[k]))

    #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close()

    eosFile('%s/really_stuck_dataset.json' % base_eos_dir,
            'w').write(json.dumps(list(really_stuck_dataset),
                                  indent=2)).close()
    print '\n' * 2, "Datasets really stuck"
    print '\n'.join(really_stuck_dataset)

    #############
    ## not going further for what matters
    #############
    return
def stagor(url, specific=None, options=None):
    """Legacy variant of the staging pass, based on the Transfer table.

    NOTE(review): this module defines `stagor` twice; being defined later,
    this version shadows the earlier one at import time — confirm which one
    is actually meant to run.

    Checks approval and completion of each Transfer, records per-dataset
    completion, and promotes workflows in 'staging' to 'staged' when their
    inputs are (almost) everywhere they were asked to be.  With
    options.fast, only a block-fraction availability check against
    options.goodavailability is done.

    url      -- service base url for the phedex/reqmgr helper calls
    specific -- restrict to one phedexid (slow path) or a wf-name substring
                (fast path)
    options  -- expects .fast and .goodavailability attributes

    Returns None; all effects are DB updates, prints, and emails.
    """
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    done_by_wf_id = {}         # wf.id -> {phedexid: bool done}
    done_by_input = {}         # dataset -> {phedexid: bool done}
    completion_by_input = {}   # dataset -> {phedexid: [completion % per node]}
    good_enough = 100.0        # completion percentage to call a node done

    # Re-validate the recorded lost blocks: keep only datasets still lost.
    lost = json.loads(open('lost_blocks_datasets.json').read())
    still_lost = []
    for dataset in lost:
        l = findLostBlocks(url, dataset)
        if not l:
            print dataset, "is not really lost"
        else:
            still_lost.append(dataset)
    open('lost_blocks_datasets.json',
         'w').write(json.dumps(still_lost, indent=2))

    if options.fast:
        ## fast path: only check block-level availability against a threshold
        print "doing the fast check of staged with threshold:", options.goodavailability
        for wfo in session.query(Workflow).filter(
                Workflow.status == 'staging').all():
            if specific and not specific in wfo.name: continue
            wfi = workflowInfo(url, wfo.name)
            sites_allowed = getSiteWhiteList(wfi.getIO())
            # Campaign configuration can override/trim the site white list.
            if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = CI.parameters(
                    wfi.request['Campaign'])['SiteWhitelist']
            if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = list(
                    set(sites_allowed) -
                    set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist']))
            _, primaries, _, secondaries = wfi.getIO()
            se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
            all_check = True
            for dataset in list(primaries):  #+list(secondaries) ?
                #print se_allowed
                available = getDatasetBlocksFraction(url,
                                                     dataset,
                                                     sites=se_allowed)
                all_check &= (available >= options.goodavailability)
                if not all_check: break

            if all_check:
                print "\t\t", wfo.name, "can go staged"
                wfo.status = 'staged'
                session.commit()
            else:
                print "\t", wfo.name, "can wait a bit more"
        return

    # Slow path: collect the inputs of every staging workflow ...
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        _, primaries, _, secondaries = wfi.getIO()
        for dataset in list(primaries) + list(secondaries):
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            print wfo.name, "needs", dataset

    # ... then walk every known transfer and record its completion.
    for transfer in session.query(Transfer).all():
        if specific and str(transfer.phedexid) != str(specific): continue

        # Only transfers serving at least one staging workflow matter here.
        skip = True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:
                if tr_wf.status == 'staging':
                    print "\t", transfer.phedexid, "is staging for", tr_wf.name
                    skip = False
        if skip: continue
        if transfer.phedexid < 0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url, transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            print transfer.phedexid, "is not yet approved"
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname] = {}
                if not dsname in completion_by_input:
                    completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid] = all(
                    map(lambda i: i >= good_enough,
                        checks[dsname].values()))
                completion_by_input[dsname][
                    transfer.phedexid] = checks[dsname].values()
        if checks:
            print "Checks for", transfer.phedexid, [
                node.values() for node in checks.values()
            ]
            done = all(
                map(
                    lambda i: i >= good_enough,
                    list(
                        itertools.chain.from_iterable(
                            [node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the scubscriptions API of ", transfer.phedexid
            print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:  # and tr_wf.status == 'staging':
                if not tr_wf.id in done_by_wf_id:
                    done_by_wf_id[tr_wf.id] = {}
                done_by_wf_id[tr_wf.id][transfer.phedexid] = done

        if done:
            ## transfer.status = 'done'
            print transfer.phedexid, "is done"
        else:
            print transfer.phedexid, "not finished"
            pprint.pprint(checks)

    #print done_by_input
    print "\n----\n"
    for dsname in done_by_input:
        fractions = None
        if dsname in completion_by_input:
            # NOTE(review): iterates completion_by_input.values() (all
            # datasets), not completion_by_input[dsname] — verify intent.
            fractions = itertools.chain.from_iterable(
                [check.values() for check in completion_by_input.values()])

        ## the workflows in the waiting room for the dataset
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(
                Workflow.name == using_it).first()
            if wf:
                using_wfos.append(wf)

        if not len(done_by_input[dsname]):
            print "For dataset", dsname, "there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending", wf.name, "back to considered"
                        wf.status = 'considered'
                        session.commit()
                        sendEmail(
                            "send back to considered",
                            "%s was send back and might be trouble" % wf.name)
                    else:
                        print "would send", wf.name, "back to considered"
                        sendEmail(
                            "subscription lagging behind",
                            "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."
                            % wf.name)
            continue

        #need_sites = int(len(done_by_input[dsname].values())*0.7)+1
        need_sites = len(done_by_input[dsname].values())
        #if need_sites > 10: need_sites = int(need_sites/2.)
        got = done_by_input[dsname].values().count(True)
        if all([wf.status != 'staging' for wf in using_wfos]):
            ## not a single ds-using wf is in staging => moved on already
            ## just forget about it
            print "presence of", dsname, "does not matter anymore"
            print "\t", done_by_input[dsname]
            print "\t", [wf.status for wf in using_wfos]
            print "\tneeds", need_sites
            continue  #??

        ## should the need_sites reduces with time ?
        # with dataset choping, reducing that number might work as a block black-list.

        if len(done_by_input[dsname].values()) and all(
                done_by_input[dsname].values()):
            print dsname, "is everywhere we wanted"
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name, "is with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif fractions and len(list(fractions)) > 1 and set(fractions) == 1:
            # NOTE(review): `set(fractions) == 1` compares a set to an int
            # (always False), and `fractions` is an exhausted iterator by the
            # second use — this branch looks dead as written.
            print dsname, "is everywhere at the same fraction"
            print "We do not want this in the end. we want the data we asked for"
            continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            # NOTE(review): unreachable after the `continue` above.
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name, "is with us everywhere the same. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif got >= need_sites:
            print dsname, "is almost everywhere we wanted"
            #print "We do not want this in the end. we want the data we asked for"
            #continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name, "is almost with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        else:
            print "incomplete", dsname
            lost = findLostBlocks(url, dsname)
            try:
                known_lost = json.loads(
                    open('lost_blocks_datasets.json').read())
            except:
                print "enable to get the known_lost from local json file"
                known_lost = []

            if lost and not dsname in known_lost:
                lost_names = [item['name'] for item in lost]
                ## make a deeper investigation of the block location to see whether it's really nowhere
                print "We have lost", len(lost), "blocks", lost_names
                #print json.dumps( lost , indent=2 )
                sendEmail(
                    'we have lost a few blocks',
                    str(len(lost)) + " in total.\nDetails \n:" +
                    json.dumps(lost, indent=2))
                known_lost.append(dsname)
                rr = open('lost_blocks_datasets.json', 'w')
                rr.write(json.dumps(known_lost, indent=2))
                rr.close()
                ## should the status be change to held-staging and pending on a ticket

            print "\t", done_by_input[dsname]
            print "\tneeds", need_sites
            print "\tgot", got

    for wfid in done_by_wf_id:
        #print done_by_wf_id[wfid].values()
        ## ask that all related transfer get into a valid state
        if all(done_by_wf_id[wfid].values()):
            pass
def assignor(url, specific=None, talk=True, options=None):
    """Assign ready workflows to a production team via reqMgr.

    NOTE(review): a second `assignor` definition appears later in this file
    and shadows this one at import time — confirm which is live.

    For each candidate workflow (status considered/staging/staged, or a
    specific name), checks campaign go-ahead, request status, computes the
    site white list from input data presence (primary and secondary), builds
    the assignment parameter dictionary, and calls
    reqMgrClient.assignWorkflow.  On success the workflow is set 'away'.

    url      -- reqmgr/phedex service base url
    specific -- comma-separated workflow-name substrings to restrict to
    talk     -- when True, print dataset presence details
    options  -- argparse/optparse-like object; .go, .test, .restrict, .team,
                .ProcessingVersion and any reqMgrClient.assignWorkflow.keys
                attributes are consulted

    Returns None; all effects are reqMgr calls, DB updates, and prints.
    """
    if userLock('assignor'): return

    CI = campaignInfo()
    SI = siteInfo()

    # Build the candidate list: a specific workflow, or everything in the
    # considered/staging/staged pipeline.
    wfos = []
    if specific:
        wfos = session.query(Workflow).filter(
            Workflow.name == specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(
                Workflow.status == 'considered').all()
            wfos.extend(
                session.query(Workflow).filter(
                    Workflow.status == 'staging').all())
        wfos.extend(
            session.query(Workflow).filter(
                Workflow.status == 'staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print wfo.name, "to be assigned"
        wfh = workflowInfo(url, wfo.name)

        ## check if by configuration we gave it a GO
        if not CI.go(wfh.request['Campaign']) and not options.go:
            print "No go for", wfh.request['Campaign']
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            print wfo.name, wfh.request['RequestStatus'], "skipping"
            if not options.test:
                continue

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                continue

        (lheinput, primary, parent, secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList(
            (lheinput, primary, parent, secondary))
        print "Allowed", sites_allowed
        # Pick one disk SE as the non-custodial output destination.
        sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        sites_custodial = []
        if len(sites_custodial) == 0:
            print "No custodial, it's fine, it's covered in close-out"

        if len(sites_custodial) > 1:
            # NOTE(review): unreachable as written — sites_custodial is the
            # empty literal just above; sys.exit here would kill the agent.
            print "more than one custodial for", wfo.name
            sys.exit(36)

        # Intersect the locations of all secondary (pileup) datasets and
        # trim the white list to sites holding the secondaries.
        secondary_locations = None
        for sec in list(secondary):
            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items() if there
            ]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            sites_allowed = [
                site for site in sites_allowed if any([
                    osite.startswith(site)
                    for osite in one_secondary_locations
                ])
            ]

        # Classify the white-listed sites by how much of the primary input
        # they host: all of it / >90% / any piece of it.
        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence(url, prim)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed])
            sites_all_data = [
                site for site in sites_with_data if any([
                    osite.startswith(site) for osite in [
                        psite for (psite, (there, frac)) in presence.items()
                        if there
                    ]
                ])
            ]
            sites_with_data = [
                site for site in sites_with_data if any([
                    osite.startswith(site) for osite in [
                        psite for (psite, frac) in presence.items()
                        if frac[1] > 90.
                    ]
                ])
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if any([osite.startswith(site) for osite in presence.keys()])
            ]
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        ## opportunistic running where any piece of data is available
        if secondary_locations and primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            opportunistic_sites = [
                SI.SE_to_CE(site) for site in list((
                    set(secondary_locations)
                    & set(primary_locations)) - set(sites_allowed))
            ]
            print "We could be running at", opportunistic_sites, "in addition"

        if available_fractions and not all([
                available >= 1. for available in available_fractions.values()
        ]):
            print "The input dataset is not located in full at any site"
            print json.dumps(available_fractions)
            if not options.test and not options.go:
                continue  ## skip skip skip

        copies_wanted = 2.
        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            print "The input dataset is not available", copies_wanted, "times, only", available_fractions.values(
            )
            if not options.go:
                continue

        ## default back to white list to original white list with any data
        print "Allowed", sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data", sites_allowed

        if options.restrict:
            # NOTE(review): identical to the unconditional fallback just
            # above — this branch is effectively a no-op.
            print "Allowed", sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected", sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for", list(
                    set(sites_allowed) - set(sites_with_data)), "?"
                print "Whitelist site with any data", list(
                    set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name, "cannot be assign with no matched sites"
            continue

        parameters = {
            'SiteWhitelist': sites_allowed,
            'CustodialSites': sites_custodial,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': '/store/mc',  ## to be figured out (hard-coded for now)
            'ProcessingVersion': version,
        }

        ## parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if ',' in v:
                        parameters[key] = filter(None, v.split(','))
                    else:
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            ## needs to go to event based ? fail for now
            print "Falling back to event splitting ?"
            #parameters['SplittingAlgorithm'] = 'EventBased'
            continue

        ## plain assignment here
        team = 'production'
        if options and options.team:
            team = options.team
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return #if notRunningBefore( 'stagor' ): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all()) if specific: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) #if specific: # #wfos = session.query(Workflow).filter(Workflow.name==specific).all() # wfos = session.query(Workflow).filter(Workflow.name.contains(specific)).all() #if not wfos: # if specific: # wfos = session.query(Workflow).filter(Workflow.status=='considered').all() # wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) # wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog('assignor', "%s to be assigned" % wfo.name) ## check if by configuration we gave it a GO if not CI.go(wfh.request['Campaign']) and not options.go: wfh.sendLog('assignor', "No go for %s" % wfh.request['Campaign']) n_stalled += 1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' 
session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. 
] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] 
] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite in SI.sites_not_ready for osite in opportunistic_sites ])) down_time = True ## should this be send back to considered ? """ if available_fractions and not all([available>=1. 
for available in available_fractions.values()]): print "The input dataset is not located in full over sites" print json.dumps(available_fractions) if not options.test and not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## skip skip skip """ ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) sendEmail( "cannot be assigned due to downtime", "%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. 
sending back to considered." % wfo.name) continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early: wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status continue ## default back to white list to original white list with any data print "Allowed", sites_allowed if options.primary_aaa: sites_allowed = initial_sites_allowed #options.useSiteListAsLocation = True options.TrustSitelists = True else: sites_allowed = sites_with_any_data wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) if options.restrict: print "Allowed", sites_allowed sites_allowed = sites_with_any_data print "Selected", sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for", list( set(sites_allowed) - set(sites_with_data)), "?" 
print "Whitelist site with any data", list( set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): wfh.sendLog('assignor', "cannot be assign with no matched sites") sendEmail("cannot be assigned", "%s has no whitelist" % (wfo.name)) n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team ## high priority team agent #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) 
< 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): # ## consider SDSC # parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] # parameters['useSiteListAsLocation'] = True # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0: # parameters['SiteWhitelist'] = ['T3_US_SDSC'] # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") sendEmail( "Fallback to EventBased", "the workflow %s is too 
heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name) elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") sendEmail( "Modifying the job per events", "the workflow %s is too heavy in number of jobs explosion" % wfo.name) # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: sendEmail( "issue with event splitting for run-dependent MC", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: sendEmail( "setting lumi splitting for run-dependent MC", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: sendEmail("leaving splitting untouched for PU_RD*", "please check on " + wfo.name) wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." 
) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock(secure) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
# NOTE(review): orphan fragment — it reads names defined outside this chunk
# (prim_to_distribute, dss, full_spread, to_lock, act, url, SI), so it is
# presumably the tail of an ad-hoc placement routine; confirm the enclosing
# scope before editing. It spreads under-replicated datasets listed in
# legacy-reco-raw_prio.txt across the candidate sites and records them in
# addhoc_lock.json.
print sorted(prim_to_distribute)
copies_needed = 2
print "will make", copies_needed, "of all"
read_file = 'legacy-reco-raw_prio.txt'
# collect dataset paths (tokens shaped like /Primary/Processed/Tier) from each
# non-empty, non-comment line of the priority file
for l in open(read_file).read().split('\n'):
    if not l: continue
    if l.startswith('#'): continue
    all_ds = filter(lambda w: w.count('/') == 3 and w.startswith('/'), l.split())
    dss.update(all_ds)
print sorted(dss)
for ds in dss:
    # skip datasets already fully available at least once
    availability = getDatasetBlocksFraction(url, ds)
    if availability >= 1: continue
    blocks = []
    # chop the dataset into transferable pieces and spread copies_needed
    # copies over the target sites, weighted by their cpu pledges
    chops, sizes = getDatasetChops(ds, chop_threshold=4000, only_blocks=blocks)
    spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes)
    print json.dumps(spreading, indent=2)
    for site, items in spreading.items():
        full_spread[site].update(items)
    to_lock.add(ds)
# merge the newly planned datasets into the persistent ad-hoc lock list;
# only written back when act is set
addHocLocks = set(json.loads(open('addhoc_lock.json').read()))
addHocLocks.update(to_lock)
if act:
    open('addhoc_lock.json', 'w').write(json.dumps(sorted(addHocLocks), indent=2))
def parse_one(url, wfn, options=None):
    """Build the per-task error report page for one workflow.

    Collects recovery (ACDC) information, WMStats/WMErrors counts per task,
    site and exit code, optionally pulls job/condor log archives for selected
    error codes, and renders everything into an HTML page written under
    monitor_dir/report/<wfn>.

    url     : service url forwarded to the information helpers
    wfn     : workflow name
    options : optional option object; read for cache, no_JL, no_CL,
              all_errors and expose

    Returns (task_error_site_count, one_explanation):
      task_error_site_count : {task: {error code: {CE: error count}}}
      one_explanation       : {error code: set of "type (Exit code) details"}

    Side effects: shells out (ssh / xrdcp / tar / mkdir / head,tail) to fetch
    and truncate log archives, writes the HTML report, and sends it through
    wfi.sendLog('error', ...).
    """
    SI = global_SI()
    wfi = workflowInfo(url, wfn)
    where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo()
    all_blocks, needed_blocks, files_in_blocks, files_notin_dbs = wfi.getRecoveryBlocks()

    ## walk up the ACDC chain to the original request to get its real I/O
    ancestor = workflowInfo(url, wfn)
    lhe, prim, _, sec = ancestor.getIO()
    high_order_acdc = 0
    while ancestor.request['RequestType'] == 'Resubmission':
        ancestor = workflowInfo(url, ancestor.request['OriginalRequestName'])
        lhe, prim, _, sec = ancestor.getIO()
        high_order_acdc += 1
    no_input = (not lhe) and len(prim) == 0 and len(sec) == 0

    cache = 0
    if options:
        cache = options.cache
    print "cache timeout", cache

    err = wfi.getWMErrors(cache=cache)
    stat = wfi.getWMStats(cache=cache)
    #adcd = wfi.getRecoveryDoc()

    total_by_code_dash = defaultdict(int)
    total_by_site_dash = defaultdict(int)
    r_dashb = defaultdict(lambda: defaultdict(int))
    dash_board_h = 1
    if True:
        ## NB get the since from when the wf has started, not a fixed value
        ## no dashboard until we get a better api
        #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache)
        dashb = {}  # dashboard disabled: the loop below is currently a no-op
        for site, sinfo in dashb.items():
            for s_code, counts in sinfo.items():
                d_statuses = ['submitted', 'pending', 'app-unknown', 'done']  # NOTE(review): unused
                total_by_code_dash[str(s_code)] += counts.get('submitted', 0)
                total_by_site_dash[site] += counts.get('submitted', 0)
                r_dashb[str(s_code)][site] += counts.get('submitted', 0)
        print json.dumps(total_by_code_dash, indent=2)
        print json.dumps(total_by_site_dash, indent=2)

    ## per-task job-status counts, summed over all agents
    status_per_task = defaultdict(lambda: defaultdict(int))
    if not 'AgentJobInfo' in stat:
        stat['AgentJobInfo'] = {}
    for agent in stat['AgentJobInfo']:
        for task in stat['AgentJobInfo'][agent]['tasks']:
            if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]: continue
            for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']:
                info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]
                # counts may be flat ints or per-subcategory dicts
                if type(info) == dict:
                    status_per_task[task][status] += sum(stat['AgentJobInfo'][agent]['tasks'][task]['status'][status].values())
                else:
                    status_per_task[task][status] += stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]

    db_total_per_site = defaultdict(int)
    db_total_per_code = defaultdict(int)
    ## cannot do that since there is no task count in dashboard and we have to take away the submitted

    print "ACDC Information"
    print json.dumps(where_to_run, indent=2)
    print json.dumps(missing_to_run, indent=2)
    print json.dumps(missing_to_run_at, indent=2)

    task_error_site_count = {}
    one_explanation = defaultdict(set)

    do_JL = True
    do_CL = True
    do_all_error_code = False
    if options:
        do_JL = not options.no_JL
        do_CL = not options.no_CL
        do_all_error_code = options.all_errors
    if high_order_acdc >= 1:
        print high_order_acdc, "order request, pulling down all logs"
        do_all_error_code = True

    ## budget of log archives to expose, per (error code, agent)
    n_expose = 1
    if options:
        n_expose = options.expose
    expose_archive_code = {
        '134': defaultdict(lambda: n_expose),  # seg fault
        '139': defaultdict(lambda: n_expose),  # ???
        '99109': defaultdict(lambda: n_expose),  # stageout
        '99303': defaultdict(lambda: n_expose),  # no pkl report. if you are lucky
        '60450': defaultdict(lambda: n_expose),  # new
        '50513': defaultdict(lambda: n_expose),  # new
        '8001': defaultdict(lambda: n_expose),  # the usual exception in cmsRun
        '11003': defaultdict(lambda: n_expose),  # job extraction
        '73': defaultdict(lambda: n_expose),  # job extraction
    }
    expose_condor_code = {
        '99109': defaultdict(lambda: n_expose),  # stageout
        '99303': defaultdict(lambda: n_expose),  # no pkl report
        '60450': defaultdict(lambda: n_expose),  # new
        '50513': defaultdict(lambda: n_expose),  # new
        '11003': defaultdict(lambda: n_expose),
    }

    tasks = sorted(set(err.keys() + missing_to_run.keys()))
    if not tasks:
        print "no task to look at"
        #return task_error_site_count

    ## page header with links to reqmgr and the production monitor
    html = "<html> <center><h1><a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>%s</a><br><a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>%s</a><br>" % (wfn, wfn, wfi.request['PrepID'], wfi.request['PrepID'])
    if wfi.request['RequestType'] in ['ReReco']:
        html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a><br>' % wfi.request['PrepID']
    html += '</center><hr>'

    ## input dataset availability sections
    if prim:
        html += 'Reads in primary<br>'
        for dataset in prim:
            html += '<b>%s</b>' % dataset
            available = getDatasetBlocksFraction(url, dataset)
            html += '<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>' % available
            html += '<ul>'
            presence = getDatasetPresence(url, dataset)
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul><br>'
    if sec:
        html += 'Reads in secondary<br>'
        for dataset in sec:
            presence = getDatasetPresence(url, dataset)
            html += '<b>%s</b><ul>' % dataset
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul>'

    html += "Updated on %s (GMT)" % (time.asctime(time.gmtime()))
    html += """
<ul>
<li> <b><i>dashboard numbers over %d days</b></i>
<li> ↑ %% with respect to total number of error in the code
<li> → %% with respect to total number of error at the site
</ul>
""" % (dash_board_h)
    html += '<hr><br>'

    if tasks:
        min_rank = min([task.count('/') for task in tasks])
    for task in tasks:
        task_rank = task.count('/')
        task_short = task.split('/')[-1]

        ## total jobs seen per site for this task, summed over agents
        total_per_site = defaultdict(int)
        for agent in stat['AgentJobInfo']:
            if not task in stat['AgentJobInfo'][agent]['tasks']: continue
            if not 'sites' in stat['AgentJobInfo'][agent]['tasks'][task]: continue
            for site in stat['AgentJobInfo'][agent]['tasks'][task]['sites']:
                info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][site]
                for s in ['success', 'failure', 'cooloff', 'submitted']:
                    if not s in info: continue
                    data = info[s]
                    if type(data) == dict:
                        total_per_site[site] += sum(data.values())
                    else:
                        total_per_site[site] += data

        #is the task relevant to recover (discard log, cleanup)
        if any([v in task.lower() for v in ['logcol', 'cleanup']]): continue

        ## error counts per exit code and per CE, plus sample explanations
        total_count = defaultdict(int)
        error_site_count = defaultdict(lambda: defaultdict(int))
        if not task in err:
            print task, "has not reported error"
            err[task] = {}
        for exittype in err[task]:
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0': continue
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site]['errorCount']
                    total_count[errorcode_s] += count
                    error_site_count[errorcode_s][ce] += count
                    for sample in err[task][exittype][errorcode_s][site]['samples']:
                        for step in sample['errors']:
                            for report in sample['errors'][step]:
                                if report['type'] == 'CMSExeption': continue
                                one_explanation[errorcode_s].add("%s (Exit code: %s) \n%s" % (report['type'], report['exitCode'], report['details']))
                        agent = sample['agent_name']
                        wmbs = sample['wmbsid']
                        workflow = sample['workflow']
                        ## expose condor logs by running the expose script on the agent
                        if do_CL and ((errorcode_s in expose_condor_code and expose_condor_code[errorcode_s][agent]) or do_all_error_code) and 'cern' in agent:
                            os.system('ssh %s %s/WmAgentScripts/Unified/exec_expose.sh %s %s %s %s %s %s' % (agent, base_dir, workflow, wmbs, errorcode_s, base_dir, monitor_dir, task_short))
                            if errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s][agent] -= 1
                        for out in sample['output']:
                            if out['type'] == 'logArchive':
                                if do_JL and ((errorcode_s in expose_archive_code and expose_archive_code[errorcode_s][agent]) or (do_all_error_code)):
                                    if errorcode_s in expose_archive_code:
                                        expose_archive_code[errorcode_s][agent] -= 1
                                    os.system('mkdir -p /tmp/%s' % (os.getenv('USER')))
                                    local = '/tmp/%s/%s' % (os.getenv('USER'), out['lfn'].split('/')[-1])
                                    command = 'xrdcp root://cms-xrd-global.cern.ch/%s %s' % (out['lfn'], local)
                                    ## get the file
                                    os.system(command)
                                    ## if this actually fail, let's get the file from eos using the new log mapping
                                    ## expose the content
                                    label = out['lfn'].split('/')[-1].split('.')[0]
                                    m_dir = '%s/joblogs/%s/%s/%s/%s' % (monitor_dir, wfn, errorcode_s, task_short, label)
                                    os.system('mkdir -p %s' % (m_dir))
                                    os.system('tar zxvf %s -C %s' % (local, m_dir))
                                    ## truncate the content ??
                                    for fn in os.popen('find %s -type f' % (m_dir)).read().split('\n'):
                                        if not fn: continue
                                        if any([p in fn for p in ['stdout.log']]):
                                            trunc = '/tmp/%s/%s' % (os.getenv('USER'), label)
                                            head = tail = 1000
                                            os.system('(head -%d ; echo;echo;echo "<snip>";echo;echo ; tail -%d ) < %s > %s' % (head, tail, fn, trunc))
                                            os.system('mv %s %s' % (trunc, fn))

        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add(site)
                if code != '0':
                    all_codes.add(code)

        ## parse the dashboard data
        for site in total_by_site_dash:
            ## no. cannot discriminate by task in dashboard...
            #all_sites.add( site )
            pass

        ## parse the acdc data: sites that owe events but reported no error
        notreported = 'NotReported'
        all_missing_stats = set()
        for site in missing_to_run_at[task]:
            if not missing_to_run_at[task][site]: continue
            ce = SI.SE_to_CE(site)
            #all_sites.add( ce )
            all_missing_stats.add(ce)
            error_site_count[notreported][ce] = 0
            all_codes.add(notreported)  ## no error code at that point

        all_missing_stats = all_missing_stats & set(SI.all_sites)
        all_not_reported = all_missing_stats - all_sites
        all_sites = all_missing_stats | all_sites
        all_sites = all_sites & set(SI.all_sites)

        ## show the total per error code
        s_per_code = defaultdict(int)
        for site in all_sites:
            for code in sorted(all_codes):
                s_per_code[code] += error_site_count[code][site]
        no_error = len(all_not_reported) != 0
        if not no_error and notreported in all_codes:
            all_codes.remove(notreported)

        missing_events = missing_to_run[task] if task in missing_to_run else 0
        html += "<b>%s</b>" % task.split('/')[-1]
        if missing_events:
            html += " is missing <b>%s events</b>" % ("{:,}".format(missing_events))
        if no_error:
            html += "<br><b><font color=red> and has UNreported error</font></b>"

        ## error-code table header row, with joblog/condorlog links where exposed
        html += "<br><table border=1><thead><tr><th>Sites/Errors</th>"
        for code in sorted(all_codes):
            html += '<th><a href="#%s">%s</a>' % (code, code)
            if str(code) in expose_archive_code or do_all_error_code:
                html += ' <a href=../joblogs/%s/%s/%s>, JL</a>' % (wfn, code, task_short)
            if str(code) in expose_condor_code or do_all_error_code:
                html += ' <a href=../condorlogs/%s/%s/%s>, CL</a>' % (wfn, code, task_short)
            html += '</th>'
        html += '<th>Total jobs</th><th>Site Ready</th>'
        html += '</tr></thead>\n'

        ## totals row
        html += '<tr><td>Total</td>'
        for code in sorted(all_codes):
            html += '<td bgcolor=orange width=100>%d' % (s_per_code[code])
            if code in total_by_code_dash:
                html += ' (<b><i>%d</i></b>)' % total_by_code_dash[code]
            html += '</td>'
        ulist = '<ul>'
        grand = 0
        for status in sorted(status_per_task[task].keys()):
            ulist += '<li> %s %d' % (status, status_per_task[task][status])
            grand += status_per_task[task][status]
        ulist += '<li><b> Total %d </b>' % grand
        ulist += '</ul>'
        html += '<td bgcolor=orange> → %.2f%% ← </td>' % (100. * (float(sum(s_per_code.values())) / grand) if grand else 0.)
        html += '<td bgcolor=orange> %s </td>' % ulist
        html += '</tr>'

        def palette(frac):
            # map an error fraction onto a severity color bucket
            _range = {
                0.0: 'green',
                0.5: 'green',
                0.6: 'darkgreen',
                0.7: 'orange',
                0.8: 'salmon',
                0.9: 'red'
            }
            which = [k for k in _range.keys() if k <= frac]
            if which:
                there = max(which)
            else:
                there = max(_range.keys())
            return _range[there]

        ## one row per site
        for site in sorted(all_sites):
            site_in = 'Yes'
            color = 'bgcolor=lightblue'
            if not site in SI.sites_ready:
                color = 'bgcolor=indianred'
                site_in = '<b>No</b>'
                if missing_to_run_at[task][SI.CE_to_SE(site)] == 0 or min_rank == task_rank:
                    color = 'bgcolor=aquamarine'
                    site_in = '<b>No</b> but fine'
            if not no_error:
                site_in += " (%s events)" % "{:,}".format(missing_to_run_at[task][SI.CE_to_SE(site)])
            html += '<tr><td %s>%s</td>' % (color, site)
            for code in sorted(all_codes):
                if code == notreported:
                    html += '<td %s width=200>%s events </td>' % (color, "{:,}".format(missing_to_run_at[task][SI.CE_to_SE(site)]))
                else:
                    if error_site_count[code][site]:
                        er_frac = float(error_site_count[code][site]) / s_per_code[code] if s_per_code[code] else 0.
                        si_frac = float(error_site_count[code][site]) / total_per_site[site] if total_per_site[site] else 0.
                        html += '<td %s width=200>%d' % (color, error_site_count[code][site])
                        if code in r_dashb and site in r_dashb[code]:
                            html += ' (<b><i>%d</i></b>)' % (r_dashb[code][site])
                        html += ', <font color=%s>↑ %.1f%%</font>, <font color=%s>→ %.1f%%</font></td>' % (palette(er_frac), 100. * er_frac, palette(si_frac), 100. * si_frac)
                    else:
                        html += '<td %s>0</td>' % color
            html += '<td bgcolor=orange>%d</td>' % total_per_site[site]
            html += '<td %s>%s</td>' % (color, site_in)
            html += '</tr>\n'
        html += '</table><br>'
        task_error_site_count[task] = error_site_count

    ## footer: recovery blocks, orphan files and error explanations
    html += '<hr><br>'
    html += "<b>Blocks (%d/%d) needed for recovery</b><br>" % (len(needed_blocks), len(all_blocks))
    for block in sorted(needed_blocks):
        html += '%s<br>' % block
    html += "<br><b>Files in no block</b><br>"
    for f in sorted(files_notin_dbs):
        html += '%s<br>' % f

    html += '<hr><br>'
    html += '<table border=1>'
    for code in one_explanation:
        html += '<tr><td><a name="%s">%s</a></td><td>%s</td></tr>' % (code, code, '<br><br>'.join(one_explanation[code]).replace('\n', '<br>'))
        #explanations[code].update( one_explanation[code] )
    html += '</table>'
    html += ('<br>' * 30)
    html += '</html>'

    wfi.sendLog('error', html, show=False)
    fn = '%s' % wfn
    open('%s/report/%s' % (monitor_dir, fn), 'w').write(html)
    return task_error_site_count, one_explanation
def assignor(url ,specific = None, talk=True, options=None):
    """Assign 'staged' (and optionally 'considered'/'staging') workflows in ReqMgr.

    For each candidate workflow this: builds the site whitelist from site,
    campaign, memory and core constraints; checks campaign go/no-go and
    allowed secondaries; locates primary/secondary input data via PhEDEx
    presence; decides on opportunistic/AAA running; builds the assignment
    parameter dictionary; and calls reqMgrClient.assignWorkflow. On success
    the local Workflow row is set to 'away' and outputs are locked.

    url      -- reqmgr/phedex base url passed through to the helpers
    specific -- comma-separated name fragments; restrict to matching workflows
    talk     -- when True, print dataset presence dumps for primaries
    options  -- command-line options object (early, go, test, limit, partial,
                primary_aaa, restrict, team, ProcessingVersion, ...);
                NOTE: options is dereferenced unconditionally in the loop
                (options.early, options.limit, ...), so in practice it must
                not be None despite the default.

    Side effects: commits status changes to the DB session, writes
    cannot_assign.json, sends logs/emails, locks output datasets.
    """
    # Concurrency/состояние guards: bail out if another instance runs or a
    # required component is down.
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI
    #LI = lockInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    # Collect the candidate workflows. 'staged' is always considered;
    # 'considered'/'staging' are added in early-assignment mode or when a
    # specific workflow is requested, and 'considered-tried' only for specific.
    wfos = []
    if specific or options.early:
        wfos.extend( session.query(Workflow).filter(Workflow.status=='considered').all())
        wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
    if specific:
        wfos.extend( session.query(Workflow).filter(Workflow.status=='considered-tried').all())
    wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    # Map of dataset -> endpoints of on-going transfers, produced by stagor.
    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read())

    max_per_round = UC.get('max_per_round').get('assignor',None)
    max_cpuh_block = UC.get('max_cpuh_block')
    # Shuffle so that no single workflow can starve the others across rounds.
    random.shuffle( wfos )
    for wfo in wfos:
        # Per-round throttles: both stalled and assigned count toward the cap.
        if options.limit and (n_stalled+n_assigned)>options.limit:
            break
        if max_per_round and (n_stalled+n_assigned)>max_per_round:
            break
        if specific:
            # 'specific' is a comma-separated list of substrings to match.
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo( url, wfo.name)
        wfh.sendLog('assignor',"%s to be assigned"%wfo.name)

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()

        ## check if by configuration we gave it a GO
        no_go = False
        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if not CI.go( campaign ):
                wfh.sendLog('assignor',"No go for %s"%campaign)
                if not options.go:
                    # No campaign go and no override: count as stalled.
                    n_stalled+=1
                    no_go = True
                    break
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
        # All requested secondaries must be within the campaign-allowed set.
        if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)):
            wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)))
            #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary)))
            sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)), level='critical')
            if not options.go:
                n_stalled+=1
                no_go = True
        if no_go:
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                # Without a processing version the request cannot be assigned.
                wfh.sendLog('assignor',"cannot decide on version number")
                n_stalled+=1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy( sites_allowed )
        wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        # Restrict data-presence lookups to the block whitelist, possibly
        # augmented from the run whitelist.
        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))

        wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed))

        # Intersect the whitelist with the sites holding (>=98%) each
        # secondary; secondary_locations accumulates the intersection over
        # all secondaries (None means "no secondary processed yet").
        secondary_locations=None
        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass
            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
        wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        # Progressive refinements of the whitelist by primary data presence:
        #  - sites_all_data:      sites holding the full (selected) primary
        #  - sites_with_data:     sites holding >90% of it
        #  - sites_with_any_data: sites holding any piece of it
        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor",dataset_endpoints[prim]
                endpoints.update( dataset_endpoints[prim] )
            set_lfn = getLFNbase( prim )
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite in SI.sites_not_ready for osite in opportunistic_sites]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        wfh.sendLog('assignor',"we need %s CPUh"%cpuh)
        if cpuh>max_cpuh_block and not options.go:
            # Too expensive to assign without an explicit operator override.
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh)
            continue

        # Campaign may cap the number of input copies required.
        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently
        # Stall the workflow when the input is not sufficiently replicated.
        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay')
                continue
                #pass
            print json.dumps(available_fractions)
            # options.go can force through unless the data is not there even once.
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    # first run / unreadable cache: start with an empty list
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial:
                    wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor',"setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is",wfo.status
                if options.partial:
                    # partial mode: proceed with whatever locations exist
                    print "Will move on with partial locations"
                else:
                    continue

        ## default back to white list to original white list with any data
        print "Allowed",sorted(sites_allowed)

        if options.primary_aaa:
            # Trust the site list and read the primary over AAA/xrootd.
            sites_allowed = initial_sites_allowed
            #options.useSiteListAsLocation = True
            options.TrustSitelists = True
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed))

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        ### check on endpoints for on-going transfers
        if endpoints and options.partial:
            sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints]))
            print "with added endpoints",sorted(sites_allowed)

        #if options.partial:
        #    continue

        if not len(sites_allowed):
            wfh.sendLog('assignor',"cannot be assign with no matched sites")
            #sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name))
            sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
            n_stalled+=1
            continue

        # Output destination: prefer a T1 SE when one is in the whitelist.
        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]

        ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist
        ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways

        wfh.sendLog('assignor',"Placing the output on %s"%sites_out)
        # Assignment parameter payload handed to reqMgrClient.assignWorkflow.
        parameters={
            'SiteWhitelist' : sites_allowed,
            #'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }

        ## plain assignment here
        team='production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team: team = options.team

        ## high priority team agent
        #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000:
        #    team = 'highprio'
        #    sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**'])

        ## SDSC redirection
        #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]):
        #    ## consider SDSC
        #    parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC']
        #    parameters['useSiteListAsLocation'] = True
        #    team = 'allocation-based'
        #    sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**'])

        ## SDSC redirection
        #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0:
        #    parameters['SiteWhitelist'] = ['T3_US_SDSC']
        #    team = 'allocation-based'
        #    sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**'])

        # NOTE: dead branch (guarded by False), kept as a template for HLT routing.
        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name)

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if type(v)==str and ',' in v:
                        # comma-separated option values become lists
                        parameters[key] = filter(None,v.split(','))
                    else:
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        # Splitting sanity check: may override splitting-related parameters.
        split_check = wfh.checkWorkflowSplitting()
        if split_check!=True:
            parameters.update( split_check )
            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting'%wfo.name, level='critical')
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of job per event")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical')

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            # average events/lumi over all primary inputs
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
            # target ~reqJobs jobs with a 1.4 safety factor
            eventsPerJob = int(numEvents/(reqJobs*1.4))
            lumisPerJob = int(eventsPerJob/eventsPerLumi)
            if lumisPerJob==0:
                # less than one lumi per job: must split by events instead
                #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical')
                wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                parameters['EventsPerJob'] = eventsPerJob
            else:
                spl = wfh.getSplittings()[0]
                # NOTE(review): the second assignment overwrites the first, so
                # 'avg_events_per_job' effectively wins (or None) — looks
                # intentional as a preference order, but worth confirming.
                eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                    #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                    sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical')
                    wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                    parameters['LumisPerJob'] = lumisPerJob
                else:
                    #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                    sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical')
                    wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.")

        # The actual assignment call against ReqMgr.
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock( secure )
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )
                except Exception as e:
                    # locking failure is non-fatal: report and carry on
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output",str(e))
            else:
                print "ERROR could not assign",wfo.name
        else:
            # test mode: assignment was not executed, nothing to record
            pass

    print "Assignment summary:"
    sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost_blocks = json.loads(open('%s/lost_blocks_datasets.json'%monitor_dir).read()) lost_files = json.loads(open('%s/lost_files_datasets.json'%monitor_dir).read()) known_lost_blocks = {} known_lost_files = {} for dataset in set(lost_blocks.keys()+lost_files.keys()): b,f = findLostBlocksFiles(url, dataset) if dataset in lost_blocks and not b: print dataset,"has no really lost blocks" else: known_lost_blocks[dataset] = [i['name'] for i in b] if dataset in lost_files and not f: print dataset,"has no really lost files" else: known_lost_files[dataset] = [i['name'] for i in f] try: cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read()) except: print "inexisting transfer statuses. starting fresh" cached_transfer_statuses = {} transfer_statuses = {} ## pop all that are now in negative values for phedexid in cached_transfer_statuses.keys(): transfers = session.query(Transfer).filter(Transfer.phedexid==int(phedexid)).all() if not transfers: print phedexid,"does not look relevant to be in cache anymore. poping" print cached_transfer_statuses.pop( phedexid ) ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging wfois = [] needs = defaultdict(list) for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']: wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus']) wfi.status='away' session.commit() continue if not wfi.request['RequestStatus'] in ['assignment-approved']: ## should be setting 'away' too print wfo.name,"is",wfi.request['RequestStatus'] sendEmail("wrong status in staging. 
debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus'])) wfois.append( (wfo,wfi) ) _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): needs[wfo.name].append( dataset) done_by_input[dataset] = {} completion_by_input[dataset] = {} wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset)) open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2)) dataset_endpoints = defaultdict(set) endpoint_in_downtime = defaultdict(set) #endpoint_completed = defaultdict(set) endpoint_incompleted = defaultdict(set) #endpoint = defaultdict(set) send_back_to_considered = set() ## phedexid are set negative when not relevant anymore # probably there is a db schema that would allow much faster and simpler query for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': sendLog('stagor',"\t%s is staging for %s"%(transfer.phedexid, tr_wf.name)) skip=False if skip: sendLog('stagor',"setting %s to negative value"%transfer.phedexid) transfer.phedexid = -transfer.phedexid session.commit() continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: sendLog('stagor', "%s is not yet approved"%transfer.phedexid) approveSubscription(url, transfer.phedexid) continue ## check on transfer completion if str(transfer.phedexid) in cached_transfer_statuses: ### use a cache for transfer that already looked done sendLog('stagor',"read %s from cache"%transfer.phedexid) checks = cached_transfer_statuses[str(transfer.phedexid)] else: checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) ## just write this out transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) if not 
specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: sendLog('stagor',"Checks for %s are %s"%( transfer.phedexid, [node.values() for node in checks.values()])) done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done ## for those that are in staging, and the destination site is in drain #if not done and tr_wf.status == 'staging': for ds in checks: for s,v in checks[ds].items(): dataset_endpoints[ds].add( s ) if done: ## transfer.status = 'done' sendLog('stagor',"%s is done"%transfer.phedexid) cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) else: sendLog('stagor',"%s is not finished %s"%(transfer.phedexid, pprint.pformat( checks ))) pprint.pprint( checks ) ## check if the destination is in down-time for ds in checks: sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough] sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready] if sites_incomplete_down: sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds)) 
#sites_complete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v>=good_enough] #endpoint[ds].update( sites_complete ) #endpoint[ds].update( sites_incomplete ) #endpoint_completed[ds].update( sites_complete ) endpoint_incompleted[ds].update( sites_incomplete ) endpoint_in_downtime[ds].update( sites_incomplete_down ) print "End point in down time" for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k]) for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k]) print json.dumps( endpoint_in_downtime , indent=2) open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2)) open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2)) open('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2)) already_stuck = json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ) missing_in_action = defaultdict(list) print "-"*10,"Checking on workflows in staging","-"*10 #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM'] #for what in forget_about: # if not done_by_input[what]: # done_by_input[what] = {'fake':True} ## come back to workflows and check if they can go available_cache = defaultdict(lambda : defaultdict(float)) presence_cache = defaultdict(dict) for wfo,wfi in wfois: print "#"*30 ## the site white list takes site, campaign, memory and core information (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False) se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] se_allowed.sort() se_allowed_key = ','.join(se_allowed) readys={} for need in list(primaries)+list(secondaries): if not need in done_by_input: wfi.sendLog('stagor',"missing transfer report for %s"%need) readys[need] = False ## should warn someone about this !!! 
## it cannot happen, by construction sendEmail('missing transfer report','%s does not have a transfer report'%(need)) continue if not done_by_input[need] and need in list(secondaries): wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need) readys[need] = True done_by_input[need] = { "fake" : True } continue if len(done_by_input[need]) and all(done_by_input[need].values()): wfi.sendLog('stagor',"%s is ready"%need) print json.dumps( done_by_input[need] , indent=2) readys[need] = True else: wfi.sendLog('stagor',"%s is not ready"%need) print json.dumps( done_by_input[need] , indent=2) readys[need] = False if readys and all(readys.values()): if wfo.status == 'staging': wfi.sendLog('stagor',"all needs are fullfilled, setting staged") wfo.status = 'staged' session.commit() else: wfi.sendLog('stagor',"all needs are fullfilled, already") print json.dumps( readys, indent=2 ) else: wfi.sendLog('stagor',"missing requirements") copies_needed,_ = wfi.getNCopies() jump_ahead = False re_transfer = False ## there is missing input let's do something more elaborated for need in list(primaries):#+list(secondaries): if endpoint_in_downtime[need] == endpoint_incompleted[need]: #print need,"is going to an end point in downtime" wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime"%need) re_transfer=True if not se_allowed_key in available_cache[need]: available_cache[need][se_allowed_key] = getDatasetBlocksFraction( url , need, sites=se_allowed ) if available_cache[need][se_allowed_key] >= copies_needed: wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need) jump_ahead = True ## compute a time since staging to filter jump starting ? # check whether the inputs is already in the stuck list ... 
for need in list(primaries)+list(secondaries): if need in already_stuck: wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need) jump_ahead = True if jump_ahead or re_transfer: details_text = "checking on availability for %s to jump ahead"%wfo.name details_text += '\n%s wants %s copies'%(wfo.name,copies_needed) copies_needed = max(1,copies_needed-1) details_text += '\nlowering by one unit to %s'%copies_needed wfi.sendLog('stagor', details_text) all_check = True prim_where = set() for need in list(primaries): if not se_allowed_key in presence_cache[need]: presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) presence = presence_cache[need][se_allowed_key] prim_where.update( presence.keys() ) available = available_cache[need][se_allowed_key] this_check = (available >= copies_needed) wfi.sendLog('stagor', "%s is available %s times %s"%( need, available, this_check)) all_check &= this_check if not all_check: break for need in list(secondaries): ## I do not want to check on the secon this_check = all(done_by_input[need].values()) wfi.sendLog('stagor',"%s is all transfered %s"%(need, json.dumps(done_by_input[need], indent=2))) all_check&= this_check #if not se_allowed_key in presence_cache[need]: # presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) ## restrict to where the primary is #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where]) #this_check = all([there for (there,frac) in presence.values()]) #print need,"is present at all sites:",this_check #all_check&= this_check if all_check: wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged") wfo.status = 'staged' session.commit() else: print wfo.name,"has to wait a bit more" wfi.sendLog('stagor',"needs to wait a bit more") else: wfi.sendLog('stagor',"not checking availability") if re_transfer: wfi.sendLog('stagor',"Sending back to considered because of endpoint 
in downtime") if wfo.status == 'staging': wfo.status = 'considered' session.commit() send_back_to_considered.add( wfo.name ) if send_back_to_considered: #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered))) sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical') print "-"*10,"Checking on non-available datasets","-"*10 ## now check on those that are not fully available for dsname in available_cache.keys(): ## squash the se_allowed_key key available_cache[dsname] = min( available_cache[dsname].values() ) for dsname,available in available_cache.items(): using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical') else: print "would send",wf.name,"back to considered" #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. 
I would have send it back to considered but that's not good."% wf.name, level='critical') continue ## not compatible with checking on secondary availability #if all([wf.status != 'staging' for wf in using_wfos]): # ## means despite all checks that input is not needed # continue if available < 1.: print "incomplete",dsname ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only lost_blocks,lost_files = findLostBlocksFiles( url, dsname ) lost_block_names = [item['name'] for item in lost_blocks] lost_file_names = [item['name'] for item in lost_files] if lost_blocks: #print json.dumps( lost , indent=2 ) ## estimate for how much ! fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names) print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss) if fraction_loss > 0.05: ## 95% completion mark #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='warning') ## the workflow should be rejected ! for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. setting to trouble" wf.status = 'trouble' session.commit() #sendEmail('doomed workflow','%s has too much loss on the input dataset %s. please check on stagor logs https://cmst2.web.cern.ch/cmst2/unified/logs/stagor/last.log'%(wf.name, dsname)) sendLog('stagor', '%s has too much loss on the input dataset %s. 
Missing %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical') else: ## probably enough to make a ggus and remove if not dsname in known_lost_blocks: #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) )) sendLog('stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ), level='critical') known_lost_blocks[dsname] = [i['name'] for i in lost_blocks] if lost_files: fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names) print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss if fraction_loss > 0.05: #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss), level='critical') for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. 
setting to trouble" wf.status = 'trouble' session.commit() else: ## probably enough to make a ggus and remove if not dsname in known_lost_files: #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names))) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical') known_lost_files[dsname] = [i['name'] for i in lost_files] ## should the status be change to held-staging and pending on a ticket missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] print "\t",done_by_input[dsname] print "\tneeds",len(done_by_input[dsname]) print "\tgot",done_by_input[dsname].values().count(True) print "\tmissing",missings missing_in_action[dsname].extend( missings ) rr= open('%s/lost_blocks_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_blocks, indent=2)) rr.close() rr= open('%s/lost_files_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_files, indent=2)) rr.close() open('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) ) print "Stuck transfers and datasets" print json.dumps( missing_in_action, indent=2 ) print "Going further and make a report of stuck transfers" datasets_by_phid = defaultdict(set) for dataset in missing_in_action: for phid in missing_in_action[dataset]: #print dataset,"stuck through",phid datasets_by_phid[phid].add( dataset ) bad_destinations = defaultdict(set) bad_sources = defaultdict(set) report = "" really_stuck_dataset = set() transfer_timeout = UC.get("transfer_timeout") transfer_lowrate = UC.get("transfer_lowrate") for phid,datasets in datasets_by_phid.items(): issues = checkTransferLag( url, phid , datasets=list(datasets) ) for dataset in issues: for block in issues[dataset]: for destination in issues[dataset][block]: 
(block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination] ## count x_Buffer and x_MSS as one source redones=[] for d in dones: if d.endswith('Buffer') or d.endswith('Export'): if d.replace('Buffer','MSS').replace('Export','MSS') in dones: continue else: redones.append( d ) else: redones.append( d ) dones = list(set( redones )) #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones) if delay>transfer_timeout and rate<transfer_lowrate: if len(dones)>1: ## its the destination that sucks bad_destinations[destination].add( block ) else: dum=[bad_sources[d].add( block ) for d in dones] really_stuck_dataset.add( dataset ) print "add",dataset,"to really stuck" report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay) print "\n"*2 ## create tickets right away ? report+="\nbad sources "+",".join(bad_sources.keys())+"\n" for site,blocks in bad_sources.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n" for site,blocks in bad_destinations.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) print '\n'*2,"Datasets really stuck" print '\n'.join( really_stuck_dataset ) print '\n'*2,"report written at https://cmst2.web.cern.ch/cmst2/unified/logs/incomplete_transfers.log" print report stuck_transfers = dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset]) print '\n'*2,'Stuck dataset transfers' print json.dumps(stuck_transfers , indent=2) open('%s/stuck_transfers.json'%monitor_dir,'w').write( json.dumps(stuck_transfers , indent=2) ) open('%s/logs/incomplete_transfers.log'%monitor_dir,'w').write( report )
def assignor(url, specific=None, talk=True, options=None):
    """Assign workflows in 'staged' (and optionally earlier) status via ReqMgr.

    For each candidate workflow: build the site whitelist from campaign
    configuration and input-data placement (primary/secondary presence,
    AAA/xrootd overrides), pick an output destination, adjust splitting
    parameters when needed, then call reqMgrClient.assignWorkflow and update
    the local workflow status in the database session.

    Parameters
    ----------
    url : str
        Base url of the request-manager / data-service instances.
    specific : str or None
        Comma-separated name substrings; only matching workflows are treated.
    talk : bool
        When True, print per-dataset presence details.
    options :
        Parsed command-line options; many attributes are read (early, go,
        test, limit, partial, primary_aaa, secondary_aaa, team,
        ProcessingVersion, ...).
        NOTE(review): the default is None but `options.early` is dereferenced
        unconditionally below -- callers presumably always pass options;
        confirm before relying on the default.
    """
    ## bail out if another actor holds the lock or a component is down
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return
    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    NLI = newLockInfo()
    n_assigned = 0
    n_stalled = 0
    ## collect the candidate workflows; 'staged' is always considered,
    ## earlier statuses only with --early or an explicit selection
    wfos = []
    if specific or options.early:
        wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all())
        wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all())
    if specific:
        wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all())
    wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all())
    ## per-dataset transfer endpoints written out by stagor
    dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read())
    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')
    ## randomize so no single workflow can starve the round
    random.shuffle(wfos)
    for wfo in wfos:
        ## both caps count stalled + assigned, so a stuck queue still terminates
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break
        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)
        wfh.sendLog('assignor', "%s to be assigned" % wfo.name)
        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList()
        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            n_stalled += 1
            no_go = True
        ## secondaries must be within the campaign's allowed list (when one is defined)
        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
        if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)):
            wfh.sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary)))
            #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary)))
            sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary)), level='critical')
            if not options.go:
                n_stalled += 1
                no_go = True
        if no_go:
            continue
        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']
        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue
        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])
        ## restrict to the request's block whitelist, augmented by run whitelist
        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'])))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None
        ## AAA (xrootd) reading can be forced per-campaign on top of the CLI options
        primary_aaa = options.primary_aaa
        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'primary_AAA' in CI.campaigns[wfh.request['Campaign']]:
            primary_aaa = primary_aaa or CI.campaigns[wfh.request['Campaign']]['primary_AAA']
        secondary_aaa = options.secondary_aaa
        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'secondary_AAA' in CI.campaigns[wfh.request['Campaign']]:
            secondary_aaa = secondary_aaa or CI.campaigns[wfh.request['Campaign']]['secondary_AAA']
        ## narrow the whitelist to sites that (nearly fully) host every secondary,
        ## unless the secondary is read over AAA
        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check", "but we cannot yet IMO")
                #pass
            if secondary_aaa:
                #just continue without checking
                continue
            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            ## a site counts as hosting the secondary only above 98% completion
            one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list( set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            wfh.sendLog('assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed))
        initial_sites_allowed = copy.deepcopy( sites_allowed )  ## keep track of this, after secondary input location restriction : that's how you want to operate it
        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default
        endpoints = set()
        ## for each primary input: record its stagor endpoints, its LFN base,
        ## and progressively intersect the site lists with where it is present
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog('assignor', "Holding the data but not allowed %s" % sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list( set(primary_locations) & set(presence.keys()))
        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))
        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor', "We could be running in addition at %s" % sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor', "One of the usable site is in downtime %s" % ([osite in SI.sites_not_ready for osite in opportunistic_sites]))
                down_time = True
                ## should this be send back to considered ?
        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        ## refuse very large requests unless forced with --go
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue
        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)
        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max( 1, copies_wanted - less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass
        wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted)
        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently
        ## when the input is not sufficiently replicated, either stall the
        ## workflow or (with --partial / --go) push on regardless
        if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available >= 1. for available in available_fractions.values()])
            wfh.sendLog('assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor', "sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay')
                continue
                #pass
            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                ## NOTE(review): bare except silently swallows any failure
                ## (including a corrupt cannot_assign.json) -- confirm intended
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial:
                    wfh.sendLog('assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)))
                    sendEmail("cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)))
                    known.append(wfo.name)
                    ## NOTE(review): file handle is never closed explicitly;
                    ## relies on CPython refcounting to flush
                    open('cannot_assign.json', 'w').write(json.dumps(known, indent=2))
                n_stalled += 1
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if options.partial:
                    print "Will move on with partial locations"
                else:
                    continue
        ## default back to white list to original white list with any data
        print "Allowed", sorted(sites_allowed)
        if primary_aaa:
            ## reading primary over xrootd: keep the (secondary-restricted) whitelist
            sites_allowed = initial_sites_allowed
            options.TrustSitelists = True
            wfh.sendLog('assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed))
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed))
        if secondary_aaa:
            options.TrustPUSitelists = True
            wfh.sendLog('assignor', "Reading secondary through xrootd from %s" % sorted(sites_allowed))
        ### check on endpoints for on-going transfers
        if endpoints and options.partial:
            sites_allowed = list( set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints]))
            print "with added endpoints", sorted(sites_allowed)
        if not len(sites_allowed):
            wfh.sendLog('assignor', "cannot be assign with no matched sites")
            sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical')
            n_stalled += 1
            continue
        ## choose the (non-custodial) output destination, preferring a T1
        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }
        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'):
            team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        ## HLT overflow path is disabled (if False) -- kept for reference
        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name)
        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if type(v) == str and ',' in v:
                        parameters[key] = filter(None, v.split(','))
                    else:
                        parameters[key] = v
        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000
        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))
        if not options.test:
            parameters['execute'] = True
        ## override the splitting when the request is too heavy as-is
        split_check = wfh.checkWorkflowSplitting()
        if split_check != True:
            parameters.update(split_check)
            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog('assignor', 'the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting' % wfo.name, level='critical')
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of job per event")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog('assignor', "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level='critical')
        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
            ## 1.4 fudge factor to aim below the target job count
            eventsPerJob = int(numEvents / (reqJobs * 1.4))
            lumisPerJob = int(eventsPerJob / eventsPerLumi)
            if lumisPerJob == 0:
                #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical')
                wfh.sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob))
                parameters['EventsPerJob'] = eventsPerJob
            else:
                spl = wfh.getSplittings()[0]
                ## NOTE(review): the second assignment unconditionally overwrites
                ## the first, so the 'events_per_job' lookup is dead and the value
                ## becomes None whenever 'avg_events_per_job' is absent -- this
                ## looks like it was meant to be a fallback chain; confirm intent
                eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                    #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                    sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical')
                    wfh.sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob))
                    parameters['LumisPerJob'] = lumisPerJob
                else:
                    #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                    sendLog('assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical')
                    wfh.sendLog('assignor', "leaving splitting untouched for PU_RD*, please check.")
        ## perform the actual assignment call against ReqMgr
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)
        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog('assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock(secure)
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )
                except Exception as e:
                    ## locking is best-effort: report the failure but keep the assignment
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output", str(e))
            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staging").all()) if specific: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered-tried").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staged").all()) dataset_endpoints = json.loads(open("%s/dataset_endpoints.json" % monitor_dir).read()) max_per_round = UC.get("max_per_round").get("assignor", None) max_cpuh_block = UC.get("max_cpuh_block") random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(","))): continue # if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog("assignor", "%s to be assigned" % wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: n_stalled += 1 no_go = True allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and "secondaries" in CI.campaigns[campaign]: allowed_secondary.update(CI.campaigns[campaign]["secondaries"]) if (secondary and allowed_secondary) and (set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog("assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary))) # sendEmail('secondary not allowed','%s is not an allowed 
secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog( "assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)), level="critical", ) if not options.go: n_stalled += 1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request["RequestStatus"] != "assignment-approved": if not options.test: wfh.sendLog("assignor", "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request["RequestStatus"] wfo.status = "away" session.commit() continue else: print wfo.name, wfh.request["RequestStatus"] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog("assignor", "cannot decide on version number") n_stalled += 1 wfo.status = "trouble" session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog("assignor", "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request["Campaign"], "SecondaryLocation", []) blocks = [] if "BlockWhitelist" in wfh.request: blocks = wfh.request["BlockWhitelist"] if "RunWhitelist" in wfh.request and wfh.request["RunWhitelist"]: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=wfh.request["RunWhitelist"]))) wfh.sendLog("assignor", "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "primary_AAA" in CI.campaigns[wfh.request["Campaign"]] ): primary_aaa = primary_aaa or CI.campaigns[wfh.request["Campaign"]]["primary_AAA"] secondary_aaa = options.secondary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "secondary_AAA" in 
CI.campaigns[wfh.request["Campaign"]] ): secondary_aaa = secondary_aaa or CI.campaigns[wfh.request["Campaign"]]["secondary_AAA"] for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") # pass if secondary_aaa: # just continue without checking continue presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.0] # one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only # sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog("assignor", "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = "/store/mc" ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) 
for site in sites_allowed], only_blocks=blocks ) # sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] # sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.0] ] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog( "assignor", "Holding the data but not allowed %s" % sorted( list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])) ), ) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO # opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( (set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]) ) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog("assignor", "We could be 
running in addition at %s" % sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( "assignor", "One of the usable site is in downtime %s" % ([osite in SI.sites_not_ready for osite in opportunistic_sites]), ) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog("assignor", "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: # sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( "assignor", "%s requires a large numbr of CPUh %s , not assigning, please check with requester" % (wfo.name, cpuh), level="critical", ) wfh.sendLog("assignor", "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "maxcopies" in CI.campaigns[wfh.request["Campaign"]] ): copies_needed_from_campaign = CI.campaigns[wfh.request["Campaign"]]["maxcopies"] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog("assignor", "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]): not_even_once = not all([available >= 1.0 for available in available_fractions.values()]) wfh.sendLog( "assignor", "The input dataset is not available %s times, only 
%s" % (copies_wanted, available_fractions.values()), ) if down_time and not options.go and not options.early: wfo.status = "considered" session.commit() wfh.sendLog("assignor", "sending back to considered because of site downtime, instead of waiting") # sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( "assignor", "%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered." % (wfo.name), level="delay", ) continue # pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open("cannot_assign.json").read()) except: pass if ( not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial ): wfh.sendLog( "assignor", "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) known.append(wfo.name) open("cannot_assign.json", "w").write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == "considered": wfh.sendLog("assignor", "setting considered-tried") wfo.status = "considered-tried" session.commit() else: print "tried but status is", wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed", sorted(sites_allowed) if primary_aaa: sites_allowed = initial_sites_allowed options.TrustSitelists = True wfh.sendLog("assignor", "Selected to read primary through xrootd %s" % sorted(sites_allowed)) else: sites_allowed = sites_with_any_data wfh.sendLog("assignor", "Selected for any data %s" % sorted(sites_allowed)) if secondary_aaa: options.TrustPUSitelists = 
True wfh.sendLog("assignor", "Reading secondary through xrootd from %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints", sorted(sites_allowed) if not len(sites_allowed): wfh.sendLog("assignor", "cannot be assign with no matched sites") sendLog("assignor", "%s has no whitelist" % wfo.name, level="critical") n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith("T1")] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog("assignor", "Placing the output on %s" % sites_out) parameters = { "SiteWhitelist": sites_allowed, "NonCustodialSites": sites_out, "AutoApproveSubscriptionSites": list(set(sites_out)), "AcquisitionEra": wfh.acquisitionEra(), "ProcessingString": wfh.processingString(), "MergedLFNBase": set_lfn, "ProcessingVersion": version, } ## plain assignment here team = "production" if os.getenv("UNIFIED_TEAM"): team = os.getenv("UNIFIED_TEAM") if options and options.team: team = options.team if False and "T2_CH_CERN" in parameters["SiteWhitelist"]: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters["SiteWhitelist"] = ["T2_CH_CERN_HLT"] team = "hlt" ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and "," in v: parameters[key] = filter(None, v.split(",")) else: parameters[key] = v if lheinput: ## throttle reading LHE article 
wfh.sendLog("assignor", "Setting the number of events per job to 500k max") parameters["EventsPerJob"] = 500000 ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request["Campaign"])) if not options.test: parameters["execute"] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if "EventBased" in split_check.values(): wfh.sendLog("assignor", "Falling back to event splitting.") # sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name, level="critical", ) elif "EventsPerJob" in split_check.values(): wfh.sendLog("assignor", "Modifying the number of job per event") # sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level="critical" ) # Handle run-dependent MC pstring = wfh.processingString() if "PU_RD" in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if "PU_RD2" in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: # sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog( "assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level="critical", ) wfh.sendLog("assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters["EventsPerJob"] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl["events_per_job"] if "events_per_job" 
in spl else None eventsPerJobEstimated = spl["avg_events_per_job"] if "avg_events_per_job" in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: # sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog( "assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level="critical" ) wfh.sendLog("assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters["LumisPerJob"] = lumisPerJob else: # sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( "assignor", "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level="critical", ) wfh.sendLog("assignor", "leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = "away" session.commit() n_assigned += 1 wfh.sendLog("assignor", "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list(sec) + new_wfi.request["OutputDatasets"]: ## lock all outputs flat NLI.lock(secure) # for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog("assignor", "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def parse_one(url, wfn, options=None):
    """Build the per-workflow error report ("showError").

    Pulls ACDC recovery info, WMErrors and WMStats for workflow *wfn*,
    aggregates error counts per task / exit code / site, spawns log-retrieval
    threads (condor and job logs), and writes an HTML report to the monitor
    directories and to ES via wfi.sendLog.

    Returns (task_error_site_count, one_explanation):
      task_error_site_count: {task: {errorcode: {CE: count}}}
      one_explanation: {errorcode: set of "type (Exit code: N)\\ndetails" strings}
    NOTE(review): assumes `options` is always provided (options.cache etc. are
    read unconditionally despite the None default) -- confirm with callers.
    """
    def time_point(label="", sub_lap=False):
        # Wall-clock progress tracer; state is stashed on the function object
        # itself (time_point.start / .lap / .sub_lap).
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())
        print "[showError] Time check (%s) point at : %s" % (label, nows)
        print "[showError] Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "[showError] Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "[showError] Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())

    task_error_site_count = {}
    one_explanation = defaultdict(set)
    per_task_explanation = defaultdict(set)

    # Hard-coded blacklist: this workflow is known to break the parser.
    if wfn in [
            'vlimant_task_EXO-RunIISummer15wmLHEGS-04800__v1_T_170906_141738_1357'
    ]:
        return task_error_site_count, one_explanation

    time_point("Starting with %s" % wfn)

    threads = []
    SI = global_SI()
    UC = unifiedConfiguration()
    wfi = workflowInfo(url, wfn)
    time_point("wfi", sub_lap=True)
    where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo()
    time_point("acdcinfo", sub_lap=True)
    all_blocks, needed_blocks_loc, files_in_blocks, files_and_loc_notin_dbs = wfi.getRecoveryBlocks(
    )
    time_point("inputs", sub_lap=True)

    # Walk the Resubmission chain back to the original request to get the
    # true I/O of the workflow; count how many ACDC layers deep we are.
    ancestor = workflowInfo(url, wfn)
    lhe, prim, _, sec = ancestor.getIO()
    high_order_acdc = 0
    while ancestor.request['RequestType'] == 'Resubmission':
        ancestor = workflowInfo(url, ancestor.request['OriginalRequestName'])
        lhe, prim, _, sec = ancestor.getIO()
        high_order_acdc += 1
    no_input = (not lhe) and len(prim) == 0 and len(sec) == 0

    cache = options.cache
    print "cache timeout", cache

    err = wfi.getWMErrors(cache=cache)
    time_point("wmerrors", sub_lap=True)
    stat = wfi.getWMStats(cache=cache)
    time_point("wmstats", sub_lap=True)
    #adcd = wfi.getRecoveryDoc()

    total_by_code_dash = defaultdict(int)
    total_by_site_dash = defaultdict(int)
    r_dashb = defaultdict(lambda: defaultdict(int))
    dash_board_h = 1
    if False:
        # Dashboard harvesting is disabled (dead branch kept for reference).
        ## NB get the since from when the wf has started, not a fixed value
        ## no dashboard until we get a better api
        #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache)
        dashb = {}
        #print json.dumps( dashb , indent=2)
        for site, sinfo in dashb.items():
            for s_code, counts in sinfo.items():
                d_statuses = ['submitted', 'pending', 'app-unknown', 'done']
                total_by_code_dash[str(s_code)] += counts.get('submitted', 0)
                total_by_site_dash[site] += counts.get('submitted', 0)
                r_dashb[str(s_code)][site] += counts.get('submitted', 0)
        print json.dumps(total_by_code_dash, indent=2)
        print json.dumps(total_by_site_dash, indent=2)

    time_point("Got most input")

    # Aggregate job counts per task and per status across all agents.
    # A dict-valued status entry is summed over its sub-counters.
    status_per_task = defaultdict(lambda: defaultdict(int))
    if not 'AgentJobInfo' in stat:
        stat['AgentJobInfo'] = {}
        #print "bad countent ?"
        #print json.dumps( stat, indent=2)
    for agent in stat['AgentJobInfo']:
        for task in stat['AgentJobInfo'][agent]['tasks']:
            if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]:
                continue
            for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']:
                info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][
                    status]
                #print status,stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]
                if type(info) == dict:
                    status_per_task[task][status] += sum(
                        stat['AgentJobInfo'][agent]['tasks'][task]['status']
                        [status].values())
                else:
                    status_per_task[task][status] += stat['AgentJobInfo'][
                        agent]['tasks'][task]['status'][status]

    #print json.dumps( status_per_task, indent=2)
    db_total_per_site = defaultdict(int)
    db_total_per_code = defaultdict(int)
    ## cannot do that since there is no task count in dashboard and we have to take away the submitted
    #for site in dashb:
    #    for error in dashb[site]:
    #        db_total_per_site[site] += dashb[site][error]
    #        db_total_per_code[code] += dashb[site][error]

    print "ACDC Information"
    print "\t where to re-run"
    print json.dumps(where_to_run, indent=2)
    print "\t Missing events"
    print json.dumps(missing_to_run, indent=2)
    print "\t Missing events per site"
    print json.dumps(missing_to_run_at, indent=2)

    if not where_to_run and not missing_to_run and not missing_to_run_at:
        # NOTE(review): early return is disabled; the report is produced even
        # without ACDC info.
        print "showError is unable to run"
        #return task_error_site_count, one_explanation
        pass

    do_JL = not options.no_JL
    do_CL = not options.no_CL
    do_all_error_code = options.all_errors
    if high_order_acdc >= 1:
        # Deep ACDC chains get all logs pulled regardless of exposure config.
        print high_order_acdc, "order request, pulling down all logs"
        do_all_error_code = True
    if wfi.isRelval():
        print "getting all codes for relval"
        do_all_error_code = True

    # Union of tasks that reported errors and tasks with missing events.
    tasks = sorted(set(err.keys() + missing_to_run.keys()))

    if not tasks:
        print "no task to look at"
        #return task_error_site_count

    # ---- HTML report header ----
    html = "<html> <center><h1>%s, Updated on %s (GMT)" % (
        wfn, time.asctime(time.gmtime()))
    html += '</center>'
    html += '<a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>dts</a>, ' % (
        wfn)
    html += '<a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>ac</a>, ' % (
        wfi.request['PrepID'])
    html += '<a href=https://cms-gwmsmon.cern.ch/prodview/%s>Job Progress</a>, ' % (
        wfn)
    r_type = wfi.request.get('OriginalRequestType',
                             wfi.request.get('RequestType', 'NaT'))
    if r_type in ['ReReco']:
        html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a>, ' % wfi.request[
            'PrepID']
    html += '<a href="https://its.cern.ch/jira/issues/?jql=text~%s AND project = CMSCOMPPR" target="_blank">jira</a>' % (
        wfi.request['PrepID'])
    html += '<hr>'
    html += '<a href=#IO>I/O</a>, <a href=#ERROR>Errors</a>, <a href=#BLOCK>blocks</a>, <a href=#FILE>files</a>, <a href=#CODES>Error codes</a><br>'
    html += '<hr>'
    time_point("Header writen")

    # ---- I/O section: presence of primary/secondary inputs and outputs ----
    html += '<a name=IO></a>'
    if prim:
        html += 'Reads in primary<br>'
        rwl = wfi.getRunWhiteList()
        lwl = wfi.getLumiWhiteList()
        for dataset in prim:
            html += '<b>%s </b>(events/lumi ~%d)' % (
                dataset, getDatasetEventsPerLumi(dataset))
            # NOTE(review): the second assignment overwrites the first, so a
            # run whitelist is ignored whenever a lumi whitelist exists (and
            # `blocks` is reset to None when lwl is empty) -- confirm intent.
            blocks = getDatasetBlocks(dataset, runs=rwl) if rwl else None
            blocks = getDatasetBlocks(dataset, lumis=lwl) if lwl else None
            available = getDatasetBlocksFraction(url, dataset,
                                                 only_blocks=blocks)
            html += '<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>' % available
            html += '<ul>'
            presence = getDatasetPresence(url, dataset, only_blocks=blocks)
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul><br>'

    if sec:
        html += 'Reads in secondary<br>'
        for dataset in sec:
            presence = getDatasetPresence(url, dataset)
            html += '<b>%s</b><ul>' % dataset
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul>'

    outs = sorted(wfi.request['OutputDatasets'])
    if outs:
        html += 'Produces<br>'
        for dataset in outs:
            presence = getDatasetPresence(url, dataset)
            html += '<b>%s </b>(events/lumi ~ %d)<ul>' % (
                dataset, getDatasetEventsPerLumi(dataset))
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul>'

    time_point("Input checked")

    # ---- Error table section ----
    html += """
<hr><br>
<a name=ERROR></a>
<ul>
<li> <b><i>dashboard numbers over %d days</b></i>
<li> ↑ %% with respect to total number of error in the code
<li> → %% with respect to total number of error at the site
</ul>
""" % (dash_board_h)
    html += '<br>'

    # How many log samples to expose per (code, agent); per-code budgets come
    # from the unified configuration.
    n_expose_base = options.expose  # if options else UC.get('n_error_exposed')
    print "getting", n_expose_base, "logs by default"

    if tasks:
        min_rank = min([task.count('/') for task in tasks])
    for task in tasks:
        n_expose = n_expose_base
        expose_archive_code = dict([(str(code), defaultdict(lambda: n_expose))
                                    for code in UC.get('expose_archive_code')])
        expose_condor_code = dict([(str(code), defaultdict(lambda: n_expose))
                                   for code in UC.get('expose_condor_code')])
        #print task
        task_rank = task.count('/')
        task_short = task.split('/')[-1]
        total_per_site = defaultdict(int)
        time_point("Starting with task %s" % task_short, sub_lap=True)
        notreported = 'NotReported'
        total_count = defaultdict(int)
        error_site_count = defaultdict(lambda: defaultdict(int))
        all_not_reported = set()
        # Collect skipped-file counts (pseudo error code 'NotReported') and
        # per-site job totals from WMStats, across all agents.
        for agent in stat['AgentJobInfo']:
            for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get(
                    'skipped', {}):
                info = stat['AgentJobInfo'][agent]['tasks'][task]['skipped'][
                    site]
                #print info
                all_not_reported.add(site)
                ce = SI.SE_to_CE(site)
                error_site_count[notreported][ce] += info.get(
                    'skippedFiles', 0)
                total_count[notreported] += info.get('skippedFiles', 0)
            for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get(
                    'sites', {}):
                info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][
                    site]
                for s in ['success', 'failure', 'cooloff', 'submitted']:
                    if not s in info: continue
                    data = info[s]
                    if type(data) == dict:
                        total_per_site[site] += sum(data.values())
                    else:
                        total_per_site[site] += data

        #is the task relevant to recover (discard log, cleanup)
        if any([v in task.lower() for v in ['logcol', 'cleanup']]): continue

        #total_count= defaultdict(int)
        #error_site_count = defaultdict( lambda : defaultdict(int))
        if not task in err:
            print task, "has not reported error"
            err[task] = {}
        #print err[task].keys()

        # First pass over WMErrors: accumulate counts per code and per CE.
        for exittype in err[task]:
            #print "\t",err[task][exittype].keys()
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0': continue
                #print "\t\t",err[task][exittype][errorcode_s].keys()
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site][
                        'errorCount']
                    total_count[errorcode_s] += count
                    #error_site_count[errorcode_s][site] += count
                    error_site_count[errorcode_s][ce] += count

        ## show the total
        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add(site)
                if code != '0':
                    all_codes.add(code)

        s_per_code = defaultdict(int)
        for site in all_sites:
            for code in sorted(all_codes):
                s_per_code[code] += error_site_count[code][site]

        # Codes in the top-N by total count get their logs exposed even if
        # not listed in the configured exposure budgets.
        expose_top_N = UC.get('expose_top_N')
        count_top_N = min(
            sorted(s_per_code.values(),
                   reverse=True)[:expose_top_N]) if s_per_code else -1

        # Second pass: harvest error samples (explanations) and queue log
        # retrieval threads, decrementing the per-(code, agent) budgets.
        for exittype in err[task]:
            #print "\t",err[task][exittype].keys()
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0': continue
                #print "\t\t",err[task][exittype][errorcode_s].keys()
                force_code = (count_top_N > 0
                              and s_per_code[errorcode_s] >= count_top_N)
                if force_code: print "will expose", errorcode_s, "anyways"
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site][
                        'errorCount']
                    ###total_count[errorcode_s] += count
                    #error_site_count[errorcode_s][site] += count
                    ###error_site_count[errorcode_s][ce] += count
                    for sample in err[task][exittype][errorcode_s][site][
                            'samples']:
                        #print sample.keys()
                        for step in sample['errors']:
                            for report in sample['errors'][step]:
                                # NOTE(review): 'CMSExeption' (sic) is presumably
                                # the literal, misspelled type string produced
                                # upstream -- do not "fix" without checking.
                                if report['type'] == 'CMSExeption': continue
                                #if int(report['exitCode']) == int(errorcode_s):
                                one_explanation[errorcode_s].add(
                                    "%s (Exit code: %s) \n%s" %
                                    (report['type'], report['exitCode'],
                                     report['details']))
                                per_task_explanation[
                                    "%s:%s" % (task_short, errorcode_s)].add(
                                        "%s (Exit code: %s) \n%s" %
                                        (report['type'], report['exitCode'],
                                         report['details']))
                                #one_explanation[errorcode_s].add( report['details'] )
                                #else:
                                #one_explanation[
                        agent = sample['agent_name']
                        wmbs = sample['wmbsid']
                        workflow = sample['workflow']

                        if force_code:
                            # Ensure budgets exist for forced codes.
                            if not errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s] = defaultdict(
                                    lambda: n_expose)
                            if not errorcode_s in expose_archive_code:
                                expose_archive_code[errorcode_s] = defaultdict(
                                    lambda: n_expose)

                        # Condor logs: only from CERN-hosted agents.
                        if do_CL and ((errorcode_s in expose_condor_code and
                                       expose_condor_code[errorcode_s][agent])
                                      ) and 'cern' in agent:
                            if errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s][agent] -= 1
                            print errorcode_s, agent, "error count", expose_condor_code.get(
                                errorcode_s, {}).get(agent, 0)
                            threads.append(
                                AgentBuster(agent=agent,
                                            workflow=workflow,
                                            wmbs=wmbs,
                                            errorcode_s=errorcode_s,
                                            base_eos_dir=base_eos_dir,
                                            monitor_eos_dir=monitor_eos_dir,
                                            task_short=task_short))
                        # Job log archives: fetched over xrootd.
                        for out in sample['output']:
                            #print out
                            if out['type'] == 'logArchive':
                                if do_JL and (
                                    (errorcode_s in expose_archive_code and
                                     expose_archive_code[errorcode_s][agent] >
                                     0)):
                                    if errorcode_s in expose_archive_code:
                                        expose_archive_code[errorcode_s][
                                            agent] -= 1
                                    print errorcode_s, agent, "error count", expose_archive_code.get(
                                        errorcode_s, {}).get(agent, 0)
                                    threads.append(
                                        XRDBuster(
                                            out_lfn=out['lfn'],
                                            monitor_eos_dir=monitor_eos_dir,
                                            wfn=wfn,
                                            errorcode_s=errorcode_s,
                                            task_short=task_short,
                                            from_eos=(
                                                not options.not_from_eos
                                            ),  # if options else True),
                                        ))

        #print task
        #print json.dumps( total_count, indent=2)
        #print json.dumps( explanations , indent=2)
        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add(site)
                if code != '0':
                    all_codes.add(code)

        ## parse the dashboard data
        for site in total_by_site_dash:
            ## no. cannot discriminate by task in dashboard...
            #all_sites.add( site )
            pass

        ## parse the acdc data
        #notreported='NotReported'
        #all_missing_stats = set()
        #for site in missing_to_run_at[task] if task in missing_to_run_at else []:
        #    if not missing_to_run_at[task][site]: continue
        #    ce = SI.SE_to_CE( site )
        #    #all_sites.add( ce )
        #    all_missing_stats.add( ce )
        #all_missing_stats = all_missing_stats &set(SI.all_sites)
        #all_not_reported = all_missing_stats - all_sites
        #print task
        #print "site with no report",sorted(all_not_reported)
        #print sorted(all_sites)
        #print sorted(all_missing_stats)
        #all_sites = all_missing_stats | all_sites
        #all_sites = all_sites & set(SI.all_sites)

        # NOTE(review): despite its name, no_error is True when some site did
        # NOT report (i.e. there IS an unreported error) -- confirm intent.
        no_error = len(all_not_reported) != 0
        if not no_error and notreported in all_codes:
            all_codes.remove(notreported)

        missing_events = missing_to_run[task] if task in missing_to_run else 0
        feff = wfi.getFilterEfficiency(task.split('/')[-1])
        html += "<a name=%s>" % task.split('/')[-1]
        html += "<b>%s</b>" % task.split('/')[-1]
        if missing_events:
            if feff != 1.:
                html += ' is missing %s events in input and <b>about %s events in output</b>' % (
                    "{:,}".format(missing_events), "{:,}".format(
                        int(missing_events * feff)))
            else:
                html += ' is missing <b>%s events in I/O</b>' % (
                    "{:,}".format(missing_events))
            html += ' <a href="https://cmsweb.cern.ch/couchdb/acdcserver/_design/ACDC/_view/byCollectionName?key=%%22%s%%22&include_docs=true&reduce=false" target=_blank>AC/DC</a>' % (
                wfn)
        if no_error:
            html += "<br><b><font color=red> and has UNreported error</font></b>"

        # Table header: one column per error code, with log links when exposed.
        html += "<br><table border=1><thead><tr><th>Sites/Errors</th>"
        #for site in all_sites:
        #    html+='<th>%s</th>'%site
        for code in sorted(all_codes):
            #html+='<th><a href="#%s">%s</a>'%(code,code)
            html += '<th><a href="#%s:%s">%s</a>' % (task_short, code, code)
            if (str(code) in expose_archive_code
                    or do_all_error_code):  # and n_expose_base:
                html += ' <a href=%s/joblogs/%s/%s/%s>, JobLog</a>' % (
                    unified_url_eos, wfn, code, task_short)
            if (str(code) in expose_condor_code
                    or do_all_error_code):  # and n_expose_base:
                html += ' <a href=%s/condorlogs/%s/%s/%s>, CondorLog</a>' % (
                    unified_url_eos, wfn, code, task_short)
            html += '</th>'
        html += '<th>Total jobs</th><th>Site Ready</th>'
        html += '</tr></thead>\n'

        # Totals row.
        html += '<tr><td>Total</td>'
        for code in sorted(all_codes):
            html += '<td bgcolor=orange width=100>%d' % (s_per_code[code])
            if code in total_by_code_dash:
                html += ' (<b><i>%d</i></b>)' % total_by_code_dash[code]
            html += '</td>'

        ulist = '<ul>'
        grand = 0
        for status in sorted(status_per_task[task].keys()):
            ulist += '<li> %s %d' % (status, status_per_task[task][status])
            grand += status_per_task[task][status]
        ulist += '<li><b> Total %d </b>' % grand
        ulist += '</ul>'
        #html += '<td bgcolor=orange> %.2f%% </td>'% (100.*(float(sum(s_per_code.values()))/sum(total_per_site.values())) if sum(total_per_site.values()) else 0.)
        html += '<td bgcolor=orange> → %.2f%% ← </td>' % (
            100. * (float(sum(s_per_code.values())) / grand) if grand else 0.)
        html += '<td bgcolor=orange> %s </td>' % ulist
        html += '</tr>'

        def palette(frac):
            # Map an error fraction to a display color (highest threshold <=
            # frac wins; fractions beyond 0.9 fall back to 'red' via max()).
            _range = {
                0.0: 'green',
                0.5: 'green',
                0.6: 'darkgreen',
                0.7: 'orange',
                0.8: 'salmon',
                0.9: 'red'
            }
            which = [k for k in _range.keys() if k <= frac]
            if which:
                there = max(which)
            else:
                there = max(_range.keys())
            return _range[there]

        # One row per site, colored by site readiness / recovery relevance.
        for site in sorted(all_sites):
            site_in = 'Yes'
            color = 'bgcolor=lightblue'
            if not site in SI.sites_ready:
                color = 'bgcolor=indianred'
                site_in = '<b>No</b>'
            # NOTE(review): Python precedence reads this as
            # (task in missing_to_run_at and count == 0) or (min_rank == task_rank)
            # -- confirm the `or` arm is intended to apply unconditionally.
            if task in missing_to_run_at and missing_to_run_at[task][
                    SI.CE_to_SE(site)] == 0 or min_rank == task_rank:
                color = 'bgcolor=aquamarine'
                site_in = '<b>No</b> but fine'
            if not no_error:
                site_in += " (%s events)" % ("{:,}".format(
                    missing_to_run_at[task][SI.CE_to_SE(site)]) if task in
                                             missing_to_run_at else '--')

            html += '<tr><td %s>%s</td>' % (color, site)
            for code in sorted(all_codes):
                if code == notreported:
                    html += '<td %s width=200>%s events </td>' % (
                        color, "{:,}".format(
                            missing_to_run_at[task][SI.CE_to_SE(site)]))
                else:
                    if error_site_count[code][site]:
                        er_frac = float(
                            error_site_count[code][site]
                        ) / s_per_code[code] if s_per_code[code] else 0.
                        si_frac = float(
                            error_site_count[code][site]) / total_per_site[
                                site] if total_per_site[site] else 0.
                        html += '<td %s width=200>%d' % (
                            color, error_site_count[code][site])
                        if code in r_dashb and site in r_dashb[code]:
                            html += ' (<b><i>%d</i></b>)' % (
                                r_dashb[code][site])
                        html += ', <font color=%s>↑ %.1f%%</font>, <font color=%s>→ %.1f%%</font></td>' % (
                            palette(er_frac), 100. * er_frac, palette(si_frac),
                            100. * si_frac)
                    else:
                        html += '<td %s>0</td>' % color
            html += '<td bgcolor=orange>%d</td>' % total_per_site[site]
            html += '<td %s>%s</td>' % (color, site_in)
            html += '</tr>\n'
        html += '</table><br>'
        task_error_site_count[task] = error_site_count

    ## run all retrieval
    run_threads = ThreadHandler(
        threads=threads,
        n_threads=options.log_threads,  # if options else 5,
        sleepy=10,
        timeout=UC.get('retrieve_errors_timeout'),
        verbose=True)
    run_threads.start()

    # ---- Blocks and files needed for recovery ----
    html += '<hr><br>'
    html += '<a name=BLOCK></a>'
    html += "<b>Blocks (%d/%d) needed for recovery</b><br>" % (
        len(needed_blocks_loc), len(all_blocks))
    for block in sorted(needed_blocks_loc.keys()):
        html += '%s <b>@ %s</b><br>' % (block, ','.join(
            sorted(needed_blocks_loc[block])))

    html += '<a name=FILE></a>'
    html += "<br><b>Files in no block</b><br>"
    rthreads = []
    check_files = [f for f in files_and_loc_notin_dbs.keys() if '/store' in f]
    random.shuffle(check_files)
    check_files = check_files[:100]
    check_files = []  ## disable it completely
    by_f = {}
    if check_files:
        # Dead code while check_files is force-emptied above: probe file
        # readability in parallel.
        for f in check_files:
            rthreads.append(ReadBuster(file=f))
        print "checking on existence of", len(rthreads), "files"
        run_rthreads = ThreadHandler(threads=rthreads,
                                     n_threads=20,
                                     timeout=10)
        run_rthreads.start()
        while run_rthreads.is_alive():
            time.sleep(10)
        for t in run_rthreads.threads:
            by_f[t.file] = t.readable
            #print "checked",t.file,t.readable
    for f in sorted(files_and_loc_notin_dbs.keys()):
        readable = by_f.get(f, -1)
        if readable == -1:
            fs = '%s' % f
        elif readable == 0:
            # NOTE(review): 'light green' is not a valid HTML color name
            # ('lightgreen' is) -- browsers will ignore it.
            fs = '<font color=light green>%s</font>' % f
            #print f,"is readable"
        else:
            fs = '<font color=red>%s</font>' % f
            #print f,"is not readable"
        html += '%s <b>@</b> %s<br>' % (fs, ','.join(
            sorted(files_and_loc_notin_dbs[f])))
        #html +='%s <b>@</b> %s<br>'%(f , ','.join(sorted(files_and_loc_notin_dbs[f])) )

    # ---- Error-code explanations ----
    html += '<hr><br>'
    html += '<a name=CODES></a>'
    html += '<table border=1>'
    for code in per_task_explanation:
        html += '<tr><td><a name="%s">%s</a><br><a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/JobExitCodes>code twiki</a></td><td>%s</td></tr>' % (
            code, code, '<br><br>'.join(per_task_explanation[code]).replace(
                '\n', '<br>'))
    #for code in one_explanation:
    #    html +='<tr><td><a name="%s">%s</a></td><td>%s</td></tr>'% ( code, code, '<br><br>'.join(one_explanation[code]).replace('\n','<br>' ))
    html += '</table>'
    html += ('<br>' * 30)
    html += '</html>'
    time_point("Report finished")
    # Ship the report: to ES via sendLog, and to both monitor directories.
    wfi.sendLog('error', html, show=False)
    fn = '%s' % wfn
    time_point("error send to ES")
    open('%s/report/%s' % (monitor_dir, fn), 'w').write(html)
    open('%s/report/%s' % (monitor_eos_dir, fn), 'w').write(html)
    time_point("Finished with showError")

    ## then wait for the retrivals to complete
    ping = 0
    while run_threads.is_alive():
        ping += 1
        # NOTE(review): `if ping % 100:` is truthy 99 iterations out of 100,
        # so this prints almost every loop -- probably meant `== 0`.
        if ping % 100:
            time_point("waiting for sub-threads to finish")
        time.sleep(6)
    time_point("Finished with retrieval threads")

    return task_error_site_count, one_explanation
def assignor(url ,specific = None, talk=True, options=None): if userLock('assignor'): return CI = campaignInfo() SI = siteInfo() wfos=[] if specific: wfos = session.query(Workflow).filter(Workflow.name==specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter(Workflow.status=='considered').all() wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print wfo.name,"to be assigned" wfh = workflowInfo( url, wfo.name) ## check if by configuration we gave it a GO if not CI.go( wfh.request['Campaign'] ) and not options.go: print "No go for",wfh.request['Campaign'] continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': print wfo.name,wfh.request['RequestStatus'],"skipping" if not options.test: continue ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: print "cannot decide on version number" continue (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print "Allowed",sites_allowed sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] sites_custodial = [] if len(sites_custodial)==0: print "No custodial, it's fine, it's covered in close-out" if len(sites_custodial)>1: print "more than one custodial for",wfo.name sys.exit(36) secondary_locations=None for sec in list(secondary): presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.] 
one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} for prim in list(primary): presence = getDatasetPresence( url, prim ) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] ) sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_with_any_data = [site for site in sites_with_any_data if any([osite.startswith(site) for osite in presence.keys()])] if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] ## opportunistic running where any piece of data is available if secondary_locations and primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & 
set(primary_locations)) - set(sites_allowed))] print "We could be running at",opportunistic_sites,"in addition" if available_fractions and not all([available>=1. for available in available_fractions.values()]): print "The input dataset is not located in full at any site" print json.dumps(available_fractions) if not options.test and not options.go: continue ## skip skip skip copies_wanted = 2. if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values() if not options.go: continue ## default back to white list to original white list with any data print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected for any data",sites_allowed if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): print wfo.name,"cannot be assign with no matched sites" continue parameters={ 'SiteWhitelist' : sites_allowed, 'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : '/store/mc', ## to be figured out ! 
from Hi shit 'ProcessingVersion' : version, } ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True if not wfh.checkWorkflowSplitting(): ## needs to go to event based ? fail for now print "Falling back to event splitting ?" #parameters['SplittingAlgorithm'] = 'EventBased' continue ## plain assignment here team='production' if options and options.team: team = options.team result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() else: print "ERROR could not assign",wfo.name else: pass
def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost_blocks = json.loads(open('%s/lost_blocks_datasets.json'%monitor_dir).read()) lost_files = json.loads(open('%s/lost_files_datasets.json'%monitor_dir).read()) known_lost_blocks = {} known_lost_files = {} for dataset in set(lost_blocks.keys()+lost_files.keys()): b,f = findLostBlocksFiles(url, dataset) if dataset in lost_blocks and not b: print dataset,"has no really lost blocks" else: known_lost_blocks[dataset] = [i['name'] for i in b] if dataset in lost_files and not f: print dataset,"has no really lost files" else: known_lost_files[dataset] = [i['name'] for i in f] try: cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read()) except: print "inexisting transfer statuses. starting fresh" cached_transfer_statuses = {} return False transfer_statuses = {} def time_point(label="",sub_lap=False): now = time.mktime(time.gmtime()) nows = time.asctime(time.gmtime()) print "Time check (%s) point at : %s"%(label, nows) print "Since start: %s [s]"% ( now - time_point.start) if sub_lap: print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) time_point.sub_lap = now else: print "Lap : %s [s]"% ( now - time_point.lap ) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime()) time_point("Check cached transfer") ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging wfois = [] needs = defaultdict(list) needs_by_priority = defaultdict(list) for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']: wfi.sendLog('stagor', "is in status 
%s"%wfi.request['RequestStatus']) wfi.status='away' session.commit() continue if not wfi.request['RequestStatus'] in ['assignment-approved']: ## should be setting 'away' too print wfo.name,"is",wfi.request['RequestStatus'] sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus'])) wfois.append( (wfo,wfi) ) _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): needs[wfo.name].append( dataset) done_by_input[dataset] = {} completion_by_input[dataset] = {} needs_by_priority[wfi.request['RequestPriority']].append( dataset ) wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset)) time_point("Check staging workflows") open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2)) for prio in needs_by_priority: needs_by_priority[prio] = list(set(needs_by_priority[prio])) open('%s/dataset_priorities.json'%monitor_dir,'w').write( json.dumps( needs_by_priority , indent=2)) dataset_endpoints = defaultdict(set) endpoint_in_downtime = defaultdict(set) #endpoint_completed = defaultdict(set) endpoint_incompleted = defaultdict(set) #endpoint = defaultdict(set) send_back_to_considered = set() ## first check if anything is inactive all_actives = set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()]) for active_phedexid in all_actives: skip = True transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == active_phedexid).all() for imp in transfers_phedexid: if imp.workflow.status == 'staging': skip =False sendLog('stagor',"\t%s is staging for %s"%(imp.phedexid, imp.workflow.name)) if skip: sendLog('stagor',"setting %s inactive" % active_phedexid) for imp in transfers_phedexid: imp.active = False session.commit() all_actives = sorted(set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()])) for phedexid in all_actives: if specific: continue ## check on 
transfer completion not_cached = False if str(phedexid) in cached_transfer_statuses: ### use a cache for transfer that already looked done sendLog('stagor',"read %s from cache"%phedexid) checks = cached_transfer_statuses[str(phedexid)] else: ## I actually would like to avoid that all I can sendLog('stagor','Performing spurious transfer check on %s'% phedexid, level='critical') checks = checkTransferStatus(url, phedexid, nocollapse=True) if not checks: ## this is going to bias quite heavily the rest of the code. we should abort here #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical') #return False sendLog('stagor','Stagor has got a skewed input from checkTransferStatus', level='critical') checks = {} pass #checks = {} #not_cached = True time_point("Check transfer status %s"% phedexid, sub_lap=True) ## just write this out transfer_statuses[str(phedexid)] = copy.deepcopy(checks) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][phedexid]=checks[dsname].values() if checks: sendLog('stagor',"Checks for %s are %s"%( phedexid, [node.values() for node in checks.values()])) done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? if not_cached: print "Transfer status was not cached" else: print "ERROR with the scubscriptions API of ",phedexid print "Most likely something else is overiding the transfer request. 
Need to work on finding the replacement automatically, if the replacement exists" done = False transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == phedexid).all() for imp in transfers_phedexid: tr_wf = imp.workflow if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][phedexid]=done if done: imp.active = False session.commit() for ds in checks: for s,v in checks[ds].items(): dataset_endpoints[ds].add( s ) if done: sendLog('stagor',"%s is done"%phedexid) cached_transfer_statuses[str(phedexid)] = copy.deepcopy(checks) else: sendLog('stagor',"%s is not finished %s"%(phedexid, pprint.pformat( checks ))) ##pprint.pprint( checks ) ## check if the destination is in down-time for ds in checks: sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough] sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready] ## no space means no transfer should go there : NO, it does not work in the long run #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))] if sites_incomplete_down: sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds)) #sites_complete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v>=good_enough] #endpoint[ds].update( sites_complete ) #endpoint[ds].update( sites_incomplete ) #endpoint_completed[ds].update( sites_complete ) endpoint_incompleted[ds].update( sites_incomplete ) endpoint_in_downtime[ds].update( sites_incomplete_down ) time_point("Check on-going transfers") print "End points" for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k]) print json.dumps( dataset_endpoints , indent=2) print "End point in down time" for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k]) print json.dumps( endpoint_in_downtime , 
indent=2) print "End point incomplete in down time" for k in endpoint_incompleted: endpoint_incompleted[k] = list(endpoint_incompleted[k]) print json.dumps( endpoint_incompleted , indent=2) #open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2)) open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2)) open('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2)) already_stuck = json.loads( open('%s/stuck_transfers.json'%monitor_pub_dir).read() ).keys() already_stuck.extend( getAllStuckDataset() ) missing_in_action = defaultdict(list) print "-"*10,"Checking on workflows in staging","-"*10 #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM'] #for what in forget_about: # if not done_by_input[what]: # done_by_input[what] = {'fake':True} ## come back to workflows and check if they can go available_cache = defaultdict(lambda : defaultdict(float)) presence_cache = defaultdict(dict) time_point("Preparing for more") for wfo,wfi in wfois: print "#"*30 time_point("Forward checking %s"% wfo.name,sub_lap=True) ## the site white list takes site, campaign, memory and core information (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False) se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] se_allowed.sort() se_allowed_key = ','.join(se_allowed) readys={} for need in list(primaries)+list(secondaries): if not need in done_by_input: wfi.sendLog('stagor',"missing transfer report for %s"%need) readys[need] = False ## should warn someone about this !!! 
## it cannot happen, by construction sendEmail('missing transfer report','%s does not have a transfer report'%(need)) continue if not done_by_input[need] and need in list(secondaries): wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need) readys[need] = True done_by_input[need] = { "fake" : True } continue if len(done_by_input[need]) and all(done_by_input[need].values()): wfi.sendLog('stagor',"%s is ready"%need) print json.dumps( done_by_input[need] , indent=2) readys[need] = True else: wfi.sendLog('stagor',"%s is not ready \n%s"%(need,json.dumps( done_by_input[need] , indent=2))) readys[need] = False if readys and all(readys.values()): if wfo.status == 'staging': wfi.sendLog('stagor',"all needs are fullfilled, setting staged") wfo.status = 'staged' session.commit() else: wfi.sendLog('stagor',"all needs are fullfilled, already") print json.dumps( readys, indent=2 ) else: wfi.sendLog('stagor',"missing requirements") copies_needed,_ = wfi.getNCopies() jump_ahead = False re_transfer = False ## there is missing input let's do something more elaborated for need in list(primaries):#+list(secondaries): if endpoint_in_downtime[need] == endpoint_incompleted[need]: #print need,"is going to an end point in downtime" wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime"%need) re_transfer=True if not se_allowed_key in available_cache[need]: available_cache[need][se_allowed_key] = getDatasetBlocksFraction( url , need, sites=se_allowed ) if available_cache[need][se_allowed_key] >= copies_needed: wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need) jump_ahead = True else: wfi.sendLog('stagor',"Available %s times"% available_cache[need][se_allowed_key]) missing_and_downtime = list(set(endpoint_in_downtime[need]) & set(endpoint_incompleted[need])) if missing_and_downtime: wfi.sendLog('stagor',"%s is incomplete at %s which is in downtime, trying to move along"%(need, ','.join(missing_and_downtime))) 
jump_ahead = True else: wfi.sendLog('stagor',"continue waiting for transfers for optimum production performance.") ## compute a time since staging to filter jump starting ? # check whether the inputs is already in the stuck list ... for need in list(primaries)+list(secondaries): if need in already_stuck: wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need) jump_ahead = True if jump_ahead or re_transfer: details_text = "checking on availability for %s to jump ahead"%wfo.name details_text += '\n%s wants %s copies'%(wfo.name,copies_needed) copies_needed = max(1,copies_needed-1) details_text += '\nlowering by one unit to %s'%copies_needed wfi.sendLog('stagor', details_text) all_check = True prim_where = set() for need in list(primaries): if not se_allowed_key in presence_cache[need]: presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) presence = presence_cache[need][se_allowed_key] prim_where.update( presence.keys() ) available = available_cache[need][se_allowed_key] this_check = (available >= copies_needed) wfi.sendLog('stagor', "%s is available %s times (%s), at %s"%( need, available, this_check, se_allowed_key)) all_check &= this_check if not all_check: break for need in list(secondaries): ## I do not want to check on the secon ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point this_check = all(done_by_input[need].values()) wfi.sendLog('stagor',"%s is this much transfered %s"%(need, json.dumps(done_by_input[need], indent=2))) all_check&= this_check #if not se_allowed_key in presence_cache[need]: # presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) ## restrict to where the primary is #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where]) #this_check = all([there for (there,frac) in presence.values()]) #print 
need,"is present at all sites:",this_check #all_check&= this_check if all_check and not re_transfer: wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged") wfo.status = 'staged' session.commit() else: print wfo.name,"has to wait a bit more" wfi.sendLog('stagor',"needs to wait a bit more") else: wfi.sendLog('stagor',"not checking availability") if re_transfer: wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime") if wfo.status == 'staging': wfo.status = 'considered' session.commit() send_back_to_considered.add( wfo.name ) time_point("Checked affected workflows") if send_back_to_considered: #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered))) sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical') print "-"*10,"Checking on non-available datasets","-"*10 ## now check on those that are not fully available for dsname in available_cache.keys(): ## squash the se_allowed_key key available_cache[dsname] = min( available_cache[dsname].values() ) really_stuck_dataset = set() for dsname,available in available_cache.items(): using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." 
for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical') else: print "would send",wf.name,"back to considered" #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical') continue ## not compatible with checking on secondary availability #if all([wf.status != 'staging' for wf in using_wfos]): # ## means despite all checks that input is not needed # continue if available < 1.: print "incomplete",dsname ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only lost_blocks,lost_files = findLostBlocksFiles( url, dsname ) if (not dsname.endswith('/RAW')) else ([],[]) lost_block_names = [item['name'] for item in lost_blocks] lost_file_names = [item['name'] for item in lost_files] if lost_blocks: #print json.dumps( lost , indent=2 ) ## estimate for how much ! fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names) print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss) if fraction_loss > 0.05: ## 95% completion mark #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical') ## the workflow should be rejected ! 
for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. setting to trouble" wf.status = 'trouble' session.commit() sendLog('stagor', '%s has too much loss on the input dataset %s. Missing %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical') else: ## probably enough to make a ggus and remove if not dsname in known_lost_blocks: #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) )) sendLog('stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ), level='critical') known_lost_blocks[dsname] = [i['name'] for i in lost_blocks] really_stuck_dataset.add( dsname ) if lost_files: fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names) print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss if fraction_loss > 0.05: #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss), level='critical') for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. 
setting to trouble" wf.status = 'trouble' session.commit() else: ## probably enough to make a ggus and remove if not dsname in known_lost_files: #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names))) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical') known_lost_files[dsname] = [i['name'] for i in lost_files] ## should the status be change to held-staging and pending on a ticket missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] print "\t",done_by_input[dsname] print "\tneeds",len(done_by_input[dsname]) print "\tgot",done_by_input[dsname].values().count(True) print "\tmissing",missings missing_in_action[dsname].extend( missings ) rr= open('%s/lost_blocks_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_blocks, indent=2)) rr.close() rr= open('%s/lost_files_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_files, indent=2)) rr.close() open('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) ) print "Stuck transfers and datasets" print json.dumps( missing_in_action, indent=2 ) datasets_by_phid = defaultdict(set) for dataset in missing_in_action: for phid in missing_in_action[dataset]: #print dataset,"stuck through",phid datasets_by_phid[phid].add( dataset ) for k in datasets_by_phid: datasets_by_phid[k] = list(datasets_by_phid[k]) open('datasets_by_phid.json','w').write( json.dumps(datasets_by_phid, indent=2 )) open('really_stuck_dataset.json','w').write( json.dumps(list(really_stuck_dataset), indent=2 )) print '\n'*2,"Datasets really stuck" print '\n'.join( really_stuck_dataset ) ############# ## not going further for what matters ############# return
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos=[] fetch_from = [] if specific or options.early: fetch_from.extend(['considered','staging']) if specific: fetch_from.extend(['considered-tried']) fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from",fetch_from for status in fetch_from: wfos.extend(session.query(Workflow).filter(Workflow.status==status).all()) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )) all_stuck.update( getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) if options.priority and int(wfh.request['RequestPriority']) < options.priority: continue options_text="" if options.early: options_text+=", early option is ON" if options.partial: options_text+=", partial option is ON" options_text+=", good fraction is %.2f"%options.good_enough wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text)) ## the 
site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = False for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update( CI.campaigns[campaign] ) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]: banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go=True wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier))) sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys())))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]: assign_parameters.update( allowed_secondary[sec] ) if no_go: n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: 
wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns: assign_parameters.update( CI.campaigns[wfh.request['Campaign']] ) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 
0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from 
stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) 
- set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back 
to white list to original white list with any data wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off") primary_aaa=False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update( aaa_mapping.get(site,[]) ) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed)) if not primary_aaa: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints",sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled+=1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low ))) copies_wanted = max(1., copies_wanted-1.) if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. 
for available in available_fractions.values()]) above_good = all([available >= do_partial for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') n_stalled+=1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good): wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled+=1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor',"cannot be assign with no matched sites") sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce 
in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed)) if 'parameters' in assign_parameters: parameters.update( assign_parameters['parameters'] ) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters 
#parameters.update( CI.parameters(wfh.request['Campaign']) ) parameters.update( assign_parameters.get('parameters',{}) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'NoGo' in split_check.values(): wfh.sendLog('assignor', "Failing splitting check") sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical') n_stalled+=1 continue if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical') ## we have a problem here, that EventBased should never be used as a backup if not options.go: n_stalled+=1 continue continue ## skip all together elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per job") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') elif 'EventsPerLumi' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this") # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per 
job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat #NLI.lock( secure ) LI.lock( secure, reason = 'assigning') #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 
'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign. Please check the logs") print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def assignor(url, specific=None, talk=True, options=None):
    """Assign 'assignment-approved' workflows in ReqMgr and update their local status.

    Iterates over workflows pulled from the local session DB (statuses taken
    from options.from_status, or 'staged' plus 'considered'/'staging' when
    running early/specific), builds the assignment parameter dictionary
    (site whitelist, output placement, splitting tweaks, AAA/xrootd flags)
    and calls reqMgrClient.assignWorkflow. Counts assigned vs stalled
    workflows and reports through sendLog.

    url      -- ReqMgr/service base url passed through to all helpers.
    specific -- comma-separated name fragments; only matching workflows are processed.
    talk     -- when True, dumps primary-dataset presence maps to stdout.
    options  -- command-line options object; read for go/early/partial/test/limit,
                team, priority, primary_aaa/secondary_aaa, from_status, etc.
                NOTE(review): options is dereferenced unconditionally in places
                (options.early, options.go) — a None options would raise; callers
                presumably always pass a parsed options object. TODO confirm.
    """
    ## concurrency / sanity guards: bail out silently if another instance
    ## holds the lock or a required component is down
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    ## respect the global lock unless explicitly overridden with --go
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0
    wfos = []

    ## decide which local statuses to read workflows from
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        ## explicit override of the status list from the command line
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    ## side-files maintained by other Unified modules (stagor/equalizor)
    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads(
        open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']

    ## datasets whose transfers are known to be stuck; used to trigger
    ## partial-copy assignment below
    all_stuck = set()
    all_stuck.update(
        json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        ## rank local workflows by the RequestPriority reported by ReqMgr;
        ## unknown names sort to the bottom (rank 0, reverse order)
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0
        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:
        ## both counters bound the amount of work done per round
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break
        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        ## relval requests that were rejected/aborted upstream are dropped locally
        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early:
            options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        ## primary input datasets with known stuck transfers
        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        ## relvals are exempt from the secondary-dataset whitelist check
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                ## campaign may veto specific output data tiers
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))))
                sendLog(
                    'assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))),
                    level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        ## block whitelist, possibly widened by run/lumi whitelists below
        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks +
                                  getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        ## campaign-level overrides of the AAA / partial-copy settings
        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        ## restrict the site whitelist to sites holding (enough of) each secondary
        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            ## sites holding more than 98% of the secondary
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                ## reading the secondary over xrootd: no site restriction,
                ## only require that the blocks exist somewhere on disk
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                ## intersect locations across all secondaries
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        ## narrow the site lists according to where the primary input actually is
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            ## availability within the allowed sites; with AAA, availability anywhere
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            ## NOTE(review): presence values look like (there, frac) pairs here,
            ## but frac[1] below treats the value as indexable — relying on the
            ## getDatasetPresence return convention; confirm against its definition
            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([
                        SI.SE_to_CE(psite)
                        for (psite, (there, frac)) in presence.items() if there
                    ]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor',
                "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" %
                    ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## campaign cap on the number of input copies wanted
        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                ## widen to the xrootd-reachable sites listed in equalizor.json
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))
        if isStoreResults:
            ## StoreResults must carry its own merged LFN base
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        ## stall the workflow if the input is not available often enough
        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                ## remember which workflows could not be assigned, across rounds
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        ## output placement: prefer a T1 storage element when one is whitelisted
        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)

        ## base assignment parameter dictionary sent to ReqMgr
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading primary through xrootd at %s" %
                sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team: team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        ## precedence between CLI options and campaign parameters is selectable
        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
            eventsPerJob = int(numEvents / (reqJobs * 1.4))
            lumisPerJob = int(eventsPerJob / eventsPerLumi)
            if lumisPerJob == 0:
                #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                sendLog('assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob),
                        level='critical')
                wfh.sendLog(
                    'assignor', "%s needs to be split by event with %s per job"
                    % (wfo.name, eventsPerJob))
                parameters['EventsPerJob'] = eventsPerJob
            else:
                spl = wfh.getSplittings()[0]
                ## NOTE(review): the first assignment below is immediately
                ## overwritten by the second — only 'avg_events_per_job' ever
                ## takes effect; presumably intentional fallback gone stale
                eventsPerJobEstimated = spl[
                    'events_per_job'] if 'events_per_job' in spl else None
                eventsPerJobEstimated = spl[
                    'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                    #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                    sendLog('assignor',
                            "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor', "%s was assigned with %s lumis/job" %
                        (wfo.name, lumisPerJob))
                    parameters['LumisPerJob'] = lumisPerJob
                else:
                    #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                    sendLog(
                        'assignor',
                        "leaving splitting untouched for %s, please check on %s"
                        % (pstring, wfo.name),
                        level='critical')
                    wfh.sendLog(
                        'assignor',
                        "leaving splitting untouched for PU_RD*, please check."
                    )

        ## redirect suitable workflows to HEPCloud/NERSC
        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')
                except Exception as e:
                    ## locking failures are reported but do not undo the assignment
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output", str(e))
            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass

    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
def assignor(url ,specific = None, talk=True, options=None):
    """Assign candidate workflows to sites through reqMgr.

    Selects workflows (a specific one, or those in considered/staging/staged),
    builds the site white list from campaign configuration, memory needs and
    primary/secondary dataset presence, verifies the input is available enough
    times, then calls reqMgrClient.assignWorkflow with the assembled
    parameters.  Successfully assigned workflows are set to 'away' and their
    input/output datasets are locked through newLockInfo.

    url      -- base service url handed to all the query helpers
    specific -- workflow name, or comma-separated name fragments, restricting
                the selection
    talk     -- if True, print the json presence map of each primary dataset
    options  -- parsed command-line options; attributes read here include
                go, test, restrict, primary_aaa, team, ProcessingVersion,
                plus any key listed in reqMgrClient.assignWorkflow.keys
    """
    # concurrency / sanity guards: do nothing if locked or components are down
    if userLock(): return
    if duplicateLock(): return
    #if notRunningBefore( 'stagor' ): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI
    #LI = lockInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    # candidate workflows: the named one if given, otherwise everything
    # 'staged' (plus 'considered'/'staging' when hunting for name fragments)
    wfos=[]
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
            wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
        wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    for wfo in wfos:
        if specific:
            # 'specific' may be a comma-separated list of name fragments
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n",wfo.name,"\n\tto be assigned"
        wfh = workflowInfo( url, wfo.name)

        ## check if by configuration we gave it a GO
        if not CI.go( wfh.request['Campaign'] ) and not options.go:
            print "No go for",wfh.request['Campaign']
            n_stalled+=1
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                print wfo.name,wfh.request['RequestStatus'],"setting away and skipping"
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                n_stalled+=1
                continue

        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()
        print "Site white list",sorted(sites_allowed)

        # campaign-level overrides of the site white/black lists
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])
        c_sites_allowed = CI.get(wfh.request['Campaign'], 'SiteWhitelist' , [])
        if c_sites_allowed:
            print "Would like to use the new whitelist, but will not until things went through a bit"
            sendEmail("using a restricted site white list","for %s"%(c_sites_allowed))
            sites_allowed = list(set(sites_allowed) & set(c_sites_allowed))

        c_black_list = CI.get(wfh.request['Campaign'], 'SiteBlacklist', [])
        if c_black_list:
            print "Reducing the whitelist due to black list in campaign configuration"
            print "Removing",c_black_list
            sites_allowed = list(set(sites_allowed) - set(c_black_list))

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']

        # keep only the sites that can serve the requested memory/core footprint
        ncores = wfh.request.get('Multicore',1)
        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] , maxCore=ncores)
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"MB and",ncores,"core are",memory_allowed
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))

        print "Allowed",sorted(sites_allowed)

        # restrict further to the sites hosting (>98%) every secondary dataset
        secondary_locations=None
        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass
            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]

        print "From secondary requirement, now Allowed",sorted(sites_allowed)

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default

        # classify the allowed sites by how much of each primary dataset they host
        for prim in list(primary):
            set_lfn = getLFNbase( prim )
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            print "Holding the data but not allowed",list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            print "We could be running at",sorted(opportunistic_sites),"in addition"

        if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
            print "One of the destination site is in downtime"
            down_time = True
            ## should this be send back to considered ?

        # the block below is deliberately disabled (kept as a bare string)
        """
        if available_fractions and not all([available>=1. for available in available_fractions.values()]):
            print "The input dataset is not located in full over sites"
            print json.dumps(available_fractions)
            if not options.test and not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue ## skip skip skip
        """

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()

        less_copies_than_requested = UC.get("less_copies_than_requested")
        copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency

        # stall (or send back to considered on downtime) when the input is not
        # available copies_wanted times over the allowed sites
        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values()
            if down_time and not options.go:
                wfo.status = 'considered'
                session.commit()
                print "sending back to considered because of site downtime, instead of waiting"
                sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                continue
                #pass
            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not sufficiently available. Probably phedex information lagging behind. \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue

        ## default back to white list to original white list with any data
        print "Allowed",sites_allowed
        if options.primary_aaa:
            # AAA reading for the primary: any site of the initial list can run
            sites_allowed = initial_sites_allowed
            options.useSiteListAsLocation = True
        else:
            sites_allowed = sites_with_any_data
            print "Selected for any data",sites_allowed

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name,"cannot be assign with no matched sites"
            sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name))
            n_stalled+=1
            continue

        # choose the output destination SE, preferring a T1 when available
        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]

        print "Placing the output on", sites_out
        parameters={
            'SiteWhitelist' : sites_allowed,
            #'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }

        ## plain assignment here
        team='production'
        if options and options.team:
            team = options.team

        #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000:
        #    team = 'highprio'
        #    sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**'])

        # NOTE(review): random.random() < -0.5 and < -1.0 below can never be
        # true -- the two SDSC re-routing branches are deliberately disabled
        if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]):
            ## consider SDSC
            parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC']
            parameters['useSiteListAsLocation'] = True
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**'])

        if wfh.request['Campaign']=='RunIIWinter15GS' and random.random() < -1.0:
            parameters['SiteWhitelist'] = ['T3_US_SDSC']
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**'])

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if type(v)==str and ',' in v:
                        parameters[key] = filter(None,v.split(','))
                    else:
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        # adjust the splitting when the request is too heavy as-is
        split_check = wfh.checkWorkflowSplitting()
        if split_check!=True:
            parameters.update( split_check )
            if 'EventBased' in split_check.values():
                print "Falling back to event splitting."
                sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
            elif 'EventsPerJob' in split_check.values():
                print "Modifying the number of job per event"
                sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)

        # Handle run-dependent MC: force event/lumi based splitting for PU_RD*
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
            eventsPerJob = int(numEvents/(reqJobs*1.4))
            lumisPerJob = int(eventsPerJob/eventsPerLumi)
            if lumisPerJob==0:
                print "There is no go for assigning that request without event splitting"
                sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                print "need to go down to",eventsPerJob,"events per job"
                parameters['EventsPerJob'] = eventsPerJob
            else:
                spl = wfh.getSplittings()[0]
                # NOTE(review): the second assignment below unconditionally
                # overwrites the first -- 'events_per_job' is effectively only
                # used through 'avg_events_per_job'; confirm this is intended
                eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                    print "need to go down to",lumisPerJob,"in assignment"
                    sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                    parameters['LumisPerJob'] = lumisPerJob
                else:
                    print "the regular splitting should work for",pstring
                    sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock( secure )
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )
                except Exception as e:
                    # a locking failure must not undo the assignment itself
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output",str(e))
            else:
                print "ERROR could not assign",wfo.name
        else:
            pass

    print "Assignment summary:"
    print "Assigned",n_assigned
    print "Stalled",n_stalled
def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = global_SI CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost = json.loads(open('lost_blocks_datasets.json').read()) still_lost = [] for dataset in lost: l = findLostBlocks(url ,dataset) if not l: print dataset,"is not really lost" else: still_lost.append( dataset ) open('lost_blocks_datasets.json','w').write( json.dumps( still_lost, indent=2) ) cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read()) if options.fast: print "doing the fast check of staged with threshold:",options.goodavailability for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): if specific and not specific in wfo.name: continue wfi = workflowInfo(url, wfo.name) (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList() if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']): sites_allowed = CI.parameters(wfi.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist'])) se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] all_check = True n_copies = wfi.getNCopies() for dataset in list(primaries):#+list(secondaries) ? 
#print se_allowed available = getDatasetBlocksFraction( url , dataset , sites=se_allowed ) #all_check &= (available >= options.goodavailability) all_check &= (available >= n_copies) if not all_check: break if all_check: print "\t\t",wfo.name,"can go staged" wfo.status = 'staged' session.commit() else: print "\t",wfo.name,"can wait a bit more" return for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) if wfi.request['RequestStatus'] in ['running-open','running-closed','completed']: print wfo.name,"is",wfi.request['RequestStatus'] wfi.status='away' session.commit() continue _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): done_by_input[dataset] = {} completion_by_input[dataset] = {} print wfo.name,"needs",dataset ## this loop is very expensive and will not function at some point. ## transfer objects should probably be deleted as some point for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': print "\t",transfer.phedexid,"is staging for",tr_wf.name skip=False if skip: print "setting",transfer.phedexid,"to negative value" transfer.phedexid = -transfer.phedexid session.commit() continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: print transfer.phedexid,"is not yet approved" approveSubscription(url, transfer.phedexid) continue ## check on transfer completion if str(transfer.phedexid) in cached_transfer_statuses: ### use a cache for transfer that already looked done print "read",transfer.phedexid,"from cache" checks = cached_transfer_statuses[str(transfer.phedexid)] else: checks = checkTransferStatus(url, transfer.phedexid, 
nocollapse=True) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: print "Checks for",transfer.phedexid,[node.values() for node in checks.values()] done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done ## for those that are in staging, and the destination site is in drain #if not done and tr_wf.status == 'staging': if done: ## transfer.status = 'done' print transfer.phedexid,"is done" cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) else: print transfer.phedexid,"not finished" pprint.pprint( checks ) open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2)) missing_in_action = defaultdict(list) #print done_by_input print "\n----\n" for dsname in done_by_input: fractions = None if dsname in completion_by_input: fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()]) ## the workflows in the waiting room for the dataset using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = 
session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) else: print "would send",wf.name,"back to considered" sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) continue #need_sites = int(len(done_by_input[dsname].values())*0.7)+1 need_sites = len(done_by_input[dsname].values()) #if need_sites > 10: need_sites = int(need_sites/2.) got = done_by_input[dsname].values().count(True) if all([wf.status != 'staging' for wf in using_wfos]): ## not a single ds-using wf is in staging => moved on already ## just forget about it #print "presence of",dsname,"does not matter anymore" #print "\t",done_by_input[dsname] #print "\t",[wf.status for wf in using_wfos] #print "\tneeds",need_sites continue ## should the need_sites reduces with time ? # with dataset choping, reducing that number might work as a block black-list. if len(done_by_input[dsname].values()) and all(done_by_input[dsname].values()): print dsname,"is everywhere we wanted" ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us. setting staged and move on" wf.status = 'staged' session.commit() elif fractions and len(list(fractions))>1 and set(fractions)==1: print dsname,"is everywhere at the same fraction" print "We do not want this in the end. 
we want the data we asked for" continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us everywhere the same. setting staged and move on" wf.status = 'staged' session.commit() elif got >= need_sites: print dsname,"is almost everywhere we wanted" #print "We do not want this in the end. we want the data we asked for" #continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is almost with us. setting staged and move on" wf.status = 'staged' session.commit() else: print "incomplete",dsname lost = findLostBlocks(url, dsname) lost_names = [item['name'] for item in lost] try: known_lost = json.loads(open('lost_blocks_datasets.json').read()) except: print "enable to get the known_lost from local json file" known_lost = [] if lost: print "We have lost",len(lost),"blocks",lost_names #print json.dumps( lost , indent=2 ) if lost and not dsname in known_lost: ## make a deeper investigation of the block location to see whether it's really no-where no-where sendEmail('we have lost a few blocks', str(len(lost))+" in total.\nDetails \n:"+json.dumps( lost , indent=2 )) known_lost.append(dsname) rr= open('lost_blocks_datasets.json','w') rr.write( json.dumps( known_lost, indent=2)) rr.close() ## should the status be change to held-staging and pending on a ticket missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] print "\t",done_by_input[dsname] print "\tneeds",need_sites print "\tgot",got print "\tmissing",missings missing_in_action[dsname].extend( missings ) open('/afs/cern.ch/user/c/cmst2/www/unified/incomplete_transfers.json','w').write( json.dumps(missing_in_action, indent=2) ) print "Stuck transfers and datasets" print json.dumps( missing_in_action, indent=2 ) print "Going further and make a report of stuck transfers" 
datasets_by_phid = defaultdict(set) for dataset in missing_in_action: for phid in missing_in_action[dataset]: #print dataset,"stuck through",phid datasets_by_phid[phid].add( dataset ) bad_destinations = defaultdict(set) bad_sources = defaultdict(set) report = "" really_stuck_dataset = set() for phid,datasets in datasets_by_phid.items(): issues = checkTransferLag( url, phid , datasets=list(datasets) ) for dataset in issues: for block in issues[dataset]: for destination in issues[dataset][block]: (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination] ## count x_Buffer and x_MSS as one source redones=[] for d in dones: if d.endswith('Buffer') or d.endswith('Export'): if d.replace('Buffer','MSS').replace('Export','MSS') in dones: continue else: redones.append( d ) else: redones.append( d ) dones = list(set( redones )) #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones) if delay>7 and rate<0.0004: if len(dones)>1: ## its the destination that sucks bad_destinations[destination].add( block ) else: dum=[bad_sources[d].add( block ) for d in dones] really_stuck_dataset.add( dataset ) report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay) print "\n"*2 ## create tickets right away ? report+="\nbad sources "+",".join(bad_sources.keys())+"\n" for site,blocks in bad_sources.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n" for site,blocks in bad_destinations.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) print report open('/afs/cern.ch/user/c/cmst2/www/unified/stuck_transfers.json','w').write( json.dumps(dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset]), indent=2) ) open('/afs/cern.ch/user/c/cmst2/www/unified/logs/incomplete_transfers.log','w').write( report )