def cachor(spec=None): if duplicateLock(silent=True): print "currently running" return try: all_checks = json.loads(open('cached_transfer_statuses.json').read()) except: all_checks = {} all_transfers = [transfer for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all()] random.shuffle( all_transfers ) existing = all_checks.keys() ## strng keys new = (set([str(transfer.phedexid) for transfer in all_transfers]) - set(existing)) print len(new),"transfers not look out at all, will do those first" if spec: new = [spec] for transfer in all_transfers: if new and str(transfer.phedexid)!=sorted(new)[0]: continue print "running the check on",transfer.phedexid new_check = checkTransferStatus(url, transfer.phedexid, nocollapse=True) if new_check : all_checks[str(transfer.phedexid)] = new_check else: print "withouth an update, we are in deep shit" sendLog('cachor','Failed transfer status check on %s'% transfer.phedexid, level='critical') #do only one break for pid in sorted(all_checks.keys()): if not all_checks[pid]: all_checks.pop(pid) open('cached_transfer_statuses.json','w').write(json.dumps( all_checks , indent=2))
def run(self): try: self.close() except Exception as e: import traceback sendLog('closor','failed on %s due to %s and %s'%( self.wfo.name, str(e), traceback.format_exc()), level='critical') self.failed = True
def cachor(spec=None): if duplicateLock(silent=True): print "currently running" return try: all_checks = json.loads(open('%s/cached_transfer_statuses.json'%base_eos_dir).read()) except: all_checks = {} ## pop all that are now in inactive for phedexid in all_checks.keys(): transfers = session.query(TransferImp).filter(TransferImp.phedexid==int(phedexid)).filter(TransferImp.active==True).all() if not transfers: print phedexid,"does not look relevant to be in cache anymore. poping" print all_checks.pop( phedexid ) all_transfers = set() for imp in session.query(TransferImp).filter(TransferImp.active==True).all(): all_transfers.add( imp.phedexid ) all_transfers = list(all_transfers) random.shuffle( all_transfers ) existing = map(int,all_checks.keys()) ## strng keys new = (set(all_transfers) - set(existing)) print len(new),"transfers not look out at all, will do those first",new if spec: new = [int(spec)] new = list(new) random.shuffle( new ) #for transfer in all_transfers: for phedexid in all_transfers: #print phedexid if new and phedexid!=new[0]: continue print "running the check on",phedexid new_check = checkTransferStatus(url, phedexid, nocollapse=True) if new_check : print json.dumps( new_check ,indent=2) all_checks[str(phedexid)] = new_check else: print "withouth an update, we are in some trouble." sendLog('cachor','Failed transfer status check on %s'% phedexid, level='critical') #do only one break for pid in sorted(all_checks.keys()): if not all_checks[pid]: print "Removing empty report for",pid all_checks.pop(pid) open('%s/cached_transfer_statuses.json'%base_eos_dir,'w').write(json.dumps( all_checks , indent=2))
def cachor(spec=None): mlock = moduleLock(silent=True) if mlock(): print "currently running" return TS = transferStatuses() print sorted(TS.all()), "cached transfers" ## pop all that are now in inactive for phedexid in TS.all(): transfers = session.query(TransferImp).filter( TransferImp.phedexid == int(phedexid)).filter( TransferImp.active == True).all() if not transfers: print phedexid, "does not look relevant to be in cache anymore. poping" TS.pop(phedexid) all_transfers = set() for imp in session.query(TransferImp).filter( TransferImp.active == True).all(): all_transfers.add(imp.phedexid) all_transfers = list(all_transfers) random.shuffle(all_transfers) existing = map(int, TS.all()) new = (set(all_transfers) - set(existing)) print len(new), "transfers not look out at all, will do those first", new if spec: new = [int(spec)] new = list(new) random.shuffle(new) #for transfer in all_transfers: for phedexid in all_transfers: #print phedexid if new and phedexid != new[0]: continue print "running the check on", phedexid new_check = checkTransferStatus(url, phedexid, nocollapse=True) if new_check: print json.dumps(new_check, indent=2) TS.add(phedexid, new_check) else: print "withouth an update, we are in some trouble." sendLog('cachor', 'Failed transfer status check on %s' % phedexid, level='critical') #do only one break
def cachor(spec=None): if duplicateLock(silent=True): print "currently running" return try: all_checks = json.loads(open('cached_transfer_statuses.json').read()) except: all_checks = {} ## pop all that are now in inactive for phedexid in all_checks.keys(): transfers = session.query(TransferImp).filter(TransferImp.phedexid==int(phedexid)).filter(TransferImp.active==True).all() if not transfers: print phedexid,"does not look relevant to be in cache anymore. poping" print all_checks.pop( phedexid ) #all_transfers = [transfer for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all()] all_transfers = list(set([imp.phedexid for imp in session.query(TransferImp).filter(TransferImp.active==True).all()])) random.shuffle( all_transfers ) existing = map(int,all_checks.keys()) ## strng keys new = (set(all_transfers) - set(existing)) print len(new),"transfers not look out at all, will do those first",new if spec: new = [int(spec)] #for transfer in all_transfers: for phedexid in all_transfers: #print phedexid if new and phedexid!=sorted(new)[0]: continue print "running the check on",phedexid new_check = checkTransferStatus(url, phedexid, nocollapse=True) if new_check : print json.dumps( new_check ,indent=2) all_checks[str(phedexid)] = new_check else: print "withouth an update, we are in some trouble." sendLog('cachor','Failed transfer status check on %s'% phedexid, level='critical') #do only one break for pid in sorted(all_checks.keys()): if not all_checks[pid]: print "Removing empty report for",pid all_checks.pop(pid) open('cached_transfer_statuses.json','w').write(json.dumps( all_checks , indent=2))
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock() and not options.go: return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] def time_point(label="",sub_lap=False): now = time.mktime(time.gmtime()) nows = time.asctime(time.gmtime()) print "Time check (%s) point at : %s"%(label, nows) print "Since start: %s [s]"% ( now - time_point.start) if sub_lap: print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) time_point.sub_lap = now else: print "Lap : %s [s]"% ( now - time_point.lap ) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime()) runnings = session.query(Workflow).filter(Workflow.status == 'away').all() standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() ## intersect with what is actually in completed status in request manager now all_completed = set(getWorkflows(url, 'completed' )) wfs=[] if options.strict: ## the one which were running and now have completed print "strict option is on: checking workflows that freshly completed" wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings)) if options.update: print "update option is on: checking workflows that have not completed yet" wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings)) if options.clear: print "clear option is on: checking workflows that are ready to toggle closed-out" wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings)) if options.review: print "review option is on: checking the workflows that needed intervention" wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings)) ## what is left out are the wf which were running and ended up aborted/failed/... custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) if use_mcm else None def get_campaign(output, wfi): ## this should be a perfect matching of output->task->campaign campaign = None era = None wf_campaign = None if 'Campaign' in wfi.request: wf_campaign = wfi.request['Campaign'] try: era = output.split('/')[2].split('-')[0] except: era = None if wfi.isRelval(): campaign = wf_campaign else: campaign = era if era else wf_campaign return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] actors = UC.get('allowed_bypass') for bypassor,email in actors: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: extending = json.loads(open(holding_file).read()) print bypassor,"is holding",extending holdings.extend( extending ) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in actors: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') #if forcings: # sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) in_manual = 0 ## now you have a record of what file was invalidated globally from TT TMDB_invalid = dataCache.get('file_invalidation') #try: # TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))]) # TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid) # print len(TMDB_invalid),"globally invalidated files" #except Exception as e: # print "TMDB not fetched" # print str(e) # TMDB_invalid = [] print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if options.limit: max_per_round=options.limit if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) time_point("Starting with %s"% wfo.name) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM')) campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] true_familly = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['PrepID'] != wfi.request['PrepID'] : continue #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical') acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue true_familly.append( member['RequestName'] ) #try: # parse_one(url, member['RequestName']) #except: # print "Could not make error report for",member['RequestName'] if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical') time_point("checked workflow familly", sub_lap=True) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} events_per_lumi = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False time_point("execpted statistics", sub_lap=True) for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100 percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) default_pass = UC.get('default_fraction_pass') fractions_pass[output] = default_pass c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: if type(CI.campaigns[c]['fractionpass']) == dict: tier = output.split('/')[-1] priority = str(wfi.request['RequestPriority']) ## defined per tier fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass) if tier in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier] if priority in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority] else: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical') #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**']) ## do not bypass for now, until Alan understands why we are loosing ACDC docs bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False time_point("checked output size", sub_lap=True) ## correct lumi < 300 event per lumi #for output in wfi.request['OutputDatasets']: #events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) time_point("checked dataset presence", sub_lap=True) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] time_point("checked custodiality", sub_lap=True) ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) time_point("checked phedex count", sub_lap=True) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" group = None if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]: group = CI.campaigns[campaign]['phedex_group'] print "using group",group,"for replica" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit") _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if custodial and size_worht_going_to_ddm > tape_size_limit: print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit)) custodial = None if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output)) custodials[custodial].append( output ) if group: custodials[custodial][-1]+='@%s'%group ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False time_point("determined tape location", sub_lap=True) ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) time_point("dbs file count", sub_lap=True) if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n" mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n" mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n" mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n" wfi.sendLog('checkor',mismatch_notice) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated), "\n".join(were_invalidated)), level='critical') dbs3Client.setFileStatus( were_invalidated, newstatus=0 ) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False time_point("checked file count", sub_lap=True) fraction_invalid = 0.20 if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} files_per_rl = {} for output in wfi.request['OutputDatasets']: duplications[output] = "skiped" files_per_rl[output] = "skiped" time_point("checked invalidation", sub_lap=True) if (is_closing or bypass_checks) and (not options.ignoreduplicates): print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except: try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except Exception as e: wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output)) sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical') is_closing=False if is_closing and any(duplications.values()) and not options.ignoreduplicates: duplicate_notice = "" duplicate_notice += "%s has duplicates\n"%wfo.name duplicate_notice += json.dumps( duplications,indent=2) duplicate_notice += '\n' duplicate_notice += json.dumps( files_per_rl, indent=2) wfi.sendLog('checkor',duplicate_notice) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False time_point("checked duplicates", sub_lap=True) time_point("done with %s"%wfo.name) ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] #rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) rec['familly'] = true_familly now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' (GMT)' ## make the lumi summary if wfi.request['RequestType'] == 'ReReco': try: os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID'])) os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID'])) wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID'])) except Exception as e: print str(e) ## make the error report ## and move on if is_closing: ## toggle status to closed-out in request manager wfi.sendLog('checkor',"setting %s closed-out"% wfo.name) if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: if not 'custodial' in assistance_tags or wfi.isRelval(): ## do only the report for those for member in acdc+acdc_inactive+[wfo.name]: try: parse_one(url, member) except: print "Could not make error report for",member ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that had ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') in_manual += 1 if 'recovery' in assistance_tags and 'manual' in assistance_tags: ## this is likely because something bad is happening, so leave it to manual assistance_tags = assistance_tags - set(['recovery']) assistance_tags.add('manual') in_manual += 1 ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name) ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) perflink = '%s/report/%s'%(unified_url,wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status)) session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec and in_manual!=0: sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: items_at = defaultdict(set) for i in custodials[site]: item, group = i.split('@') if '@' in i else (i,'DataOps') items_at[group].add( item ) for group,items in items_at.items(): print ','.join(items),'=>',site,'@',group if not options.test: result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group) print result print "File Invalidation" print invalidations
pileup_location = None for secondary in wqe['PileupData']: if pileup_location == None: pileup_location = wqe['PileupData'][secondary] else: pileup_location = list(set(pileup_location) & set(wqe['PileupData'][secondary])) #print pileup_location,"secondary" ## wqe site whitelist in terms of SE #swl = [si.CE_to_SE(s) for s in wl] swl = si.CE_to_SEs(wl) print "WQE SE whitelist",sorted(swl) if not swl: sendLog('GQ',"There is no site at which the workflow %s can run Was provided with %s"%(wf['RequestName'], ','.join(wqe['SiteWhitelist'])), level='critical') wfi.sendLog('GQ',"There is not site at which the workflow can run. Was provided with %s"%( ','.join(wqe['SiteWhitelist']))) continue not_processable = set() wqe_ce = None for b in wqe['Inputs']: ## what the global queue thinks about the block location wqe_se = [si.CE_to_SE(s) for s in wqe['Inputs'][b]] if not '#' in b: print "acdc doc input",b ## this would be an ACDC document input #retrieve it and check data file location ? original = workflowInfo(url, wfi.request['OriginalRequestName'])
def close(self): if os.path.isfile('.closor_stop'): print "The closing of workflows is shortened" return url = self.url batch_go = self.batch_go CI = self.CI UC = self.UC wfo = self.wfo jump_the_line = self.jump_the_line batch_goodness = self.batch_goodness check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## what is the expected #lumis self.wfi = workflowInfo(url, wfo.name ) wfi = self.wfi wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url , batch_name, details=True) batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog('closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'%( batch_name, batch_name)) return if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force: ## manually announced ?? self.to_status = 'done' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,self.to_wm_status)) return if jump_the_line: wfi.sendLog('closor','Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) self.outs.append( Output( datasetname = out )) odb = self.outs[-1] odb.workflow = wfo odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) fraction = lumi_count/float(expected_lumis)*100. completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction) wfi.sendLog('closor',"\t%s"% completion_line) if wfi.isRelval() and fraction < batch_goodness: self.batch_warnings[ wfi.getCampaign()].add( completion_line ) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) self.all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name,"to be announced" results=[] if not results: for out in outputs: print "dealing with",out if out in stats and not stats[out]: continue _,dsn,process_string,tier = out.split('/') if all_OK[out]: print "setting valid" results.append(setDatasetStatus(out, 'VALID', withFiles=False)) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations )))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex']['request_created'][0]['id'] results.append( True ) except: results.append( 'Failed relval transfer' ) elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 1 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" status = pass_to_dynamo( [out], N = n_copies, sites=destinations if destinations else None, group = group_spec if group_spec else None) results.append( status ) if status == True: wfi.sendLog('closor','%s is send to dynamo in %s copies %s %s'%( out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.", level='critical') wfi.sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.") else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) self.to_wm_status = wl_bis.request['RequestStatus'] if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) print results if all(map(lambda result : result in ['None',None,True],results)): if jump_the_line: if not 'announced' in wfo.status: self.to_status = wfo.status.replace('announce','announced') else: self.to_status = 'done' self.closing = True wfi.sendLog('closor',"workflow outputs are announced") else: wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results ))) elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: if wfi.isRelval(): self.to_status = 'forget' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, self.to_wm_status)) else: self.to_status = 'trouble' self.to_wm_status = wfi.request['RequestStatus'] else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") self.held.add( wfo.name )
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] holdings = [] #try: # already_notified = json.loads(open('already_notifified.json').read()) #except: # print "no record of already notified workflow. starting fresh" # already_notified = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: mcm_force = mcm.get('/restapi/requests/forcecomplete') bypasses.extend( mcm_force ) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False pids = wfi.getPrepIDs() bypass_by_mcm = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break if bypass in pids: wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass)) bypass_checks = True bypass_by_mcm = True break #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks: # print "No go for",wfo.name # wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign']) # continue tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = int(wfi.request['Task1']['RequestNumEvents']) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: print "These %d files are missing in phedex"%(len(missing_phedex)) print "\n".join( missing_phedex ) if missing_dbs: print "These %d files are missing in dbs"%(len(missing_dbs)) print "\n".join( missing_dbs ) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and bypass_by_mcm: ## shoot large on all prepids for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def equalizor(url , specific = None, options=None): up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return if not specific: workflows = getWorkflows(url, status='running-closed', details=True) workflows.extend(getWorkflows(url, status='running-open', details=True)) ## start from scratch modifications = defaultdict(dict) ## define regionality site => fallback allowed. feed on an ssb metric ?? mapping = defaultdict(list) reversed_mapping = defaultdict(list) regions = defaultdict(list) SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() for site in SI.sites_ready: region = site.split('_')[1] if not region in ['US','DE','IT']: continue regions[region] = [region] def site_in_depletion(s): return True if s in SI.sites_pressure: (m, r, pressure) = SI.sites_pressure[s] if float(m) < float(r): print s,m,r,"lacking pressure" return True else: print s,m,r,"pressure" pass return False for site in SI.sites_ready: region = site.split('_')[1] ## fallback to the region, to site with on-going low pressure mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ] use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow")) if options.t0: use_T0 = True #if options.augment : use_T0 = True use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow")) if options.hlt: use_HLT = True #if options.augment : use_HLT=True if use_HLT: mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT') if use_T0: mapping['T2_CH_CERN'].append('T0_CH_CERN') #mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN') #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF') for reg in ['IT','DE','UK']: mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_'%reg in fb]) ## make them appear as OK to use force_sites = [] ## overflow CERN to underutilized T1s upcoming = json.loads( open('%s/GQ.json'%monitor_dir).read()) for possible in SI.sites_T1s: if not possible in upcoming: mapping['T2_CH_CERN'].append(possible) ## remove add-hoc sites from overflow mapping prevent_sites = ['T2_US_Purdue'] for prevent in prevent_sites: if prevent in mapping: mapping.pop( prevent ) for src in mapping: for prevent in prevent_sites: if prevent in mapping[src]: mapping[src].remove( prevent ) ## create the reverse mapping for the condor module for site,fallbacks in mapping.items(): for fb in fallbacks: reversed_mapping[fb].append(site) ## this is the fallback mapping print "Direct mapping : site => overflow" print json.dumps( mapping, indent=2) print "Reverse mapping : dest <= from origin" print json.dumps( reversed_mapping, indent=2) altered_tasks = set() def running_idle( wfi , task_name): gmon = wfi.getGlideMon() #print gmon if not gmon: return (0,0) if not task_name in gmon: return (0,0) return (gmon[task_name]['Running'], gmon[task_name]['Idle']) def needs_action( wfi, task, min_idled = 100, pressure = 0.2): task_name = task.pathName.split('/')[-1] running, idled = running_idle( wfi, task_name) go = True if not idled and not running : go = False if idled < 100: go = False if (not running and idled) or (running and (idled / float(running) > pressure)): go = True else: go = False return go, task_name, running, idled def getPerf( task ): task = task.split('/')[1]+'/'+task.split('/')[-1] try: u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s'%task print u perf_data = json.loads(os.popen('curl -s --retry 5 %s'%u).read()) except Exception as e: print str(e) return (None,None) buckets = perf_data['aggregations']["2"]['buckets'] s_m = sum( bucket['key']*bucket['doc_count'] for bucket in buckets) w_m = sum( bucket['doc_count'] for bucket in buckets) m_m = max( bucket['key'] for bucket in buckets) if buckets else None b_m = None if w_m > 100: b_m = m_m try: perf_data = json.loads(os.popen('curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s'%task).read()) except Exception as e: print str(e) return (b_m,None) buckets = perf_data['aggregations']["2"]['buckets'] s_t = sum( bucket['key']*bucket['doc_count'] for bucket in buckets) w_t = sum( bucket['doc_count'] for bucket in buckets) m_t = max( bucket['key'] for bucket in buckets) if buckets else None b_t = None if w_t > 100: b_t = m_t return (b_m,b_t) def getcampaign( task ): taskname = task.pathName.split('/')[-1] if hasattr( task, 'prepID'): return task.prepID.split('-')[1] elif taskname.count('-')>=1: return taskname.split('-')[1] else: return None def close( interface ): open('%s/equalizor.json.new'%monitor_dir,'w').write( json.dumps( interface, indent=2)) os.system('mv %s/equalizor.json.new %s/equalizor.json'%(monitor_dir,monitor_dir)) os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json'%(monitor_dir,monitor_dir,time.mktime(time.gmtime()))) interface = { 'reversed_mapping' : reversed_mapping, 'modifications' : {} } if options.augment or options.remove: interface['modifications'] = json.loads( open('%s/equalizor.json'%monitor_dir).read())['modifications'] if options.remove: if specific in interface['modifications']: print "poping",specific interface['modifications'].pop(specific) close( interface ) return PU_locations = {} PU_overflow = {} LHE_overflow = {} tune_performance = [] pending_HLT = 0 max_HLT = 60000 pending_T0 = 0 max_T0 = 60000 try: gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read()) pending_HLT += gmon["Running"] pending_HLT += gmon["MatchingIdle"] except: pass stay_within_site_whitelist = False specific_task=None if specific and ":" in specific: specific,specific_task = specific.split(':') if specific: wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status == 'away').all() performance = {} no_routing = [ ] random.shuffle( wfs ) for wfo in wfs: if wfo.name in no_routing and not options.augment: continue if specific and not specific in wfo.name: continue if specific: wfi = workflowInfo(url, wfo.name) else: cached = filter(lambda d : d['RequestName']==wfo.name, workflows) if not cached : continue wfi = workflowInfo(url, wfo.name, request = cached[0]) ## only running should get re-routed if not wfi.request['RequestStatus'] in ['running-open','running-closed'] and not specific: continue tasks_and_campaigns = [] for task in wfi.getWorkTasks(): tasks_and_campaigns.append( (task, getcampaign(task) ) ) _,_,_,sec = wfi.getIO() ## check needs override needs_overide = False if not needs_overide and options.augment: needs_overide=True def overide_from_agent( wfi, needs_overide): bad_agents = []#'http://cmssrv219.fnal.gov:5984'] if not bad_agents: return needs_overide if needs_overide: return True agents = wfi.getAgents() wqss = ['Running','Acquired'] if any([agent in agents.get(wqs,{}).keys() for wqs,agent in itertools.product( wqss, bad_agents)]): print "overriding the need for bad agent" needs_overide = True return needs_overide ## now parse this for action for i_task,(task,campaign) in enumerate(tasks_and_campaigns): if options.augment: print task.pathName print campaign tune = CI.get(campaign,'tune',options.tune) if tune and not campaign in tune_performance: tune_performance.append( campaign ) overflow = CI.get(campaign,'overflow',{}) if overflow: if "PU" in overflow and not campaign in PU_overflow: PU_overflow[campaign] = copy.deepcopy(overflow['PU']) print "adding",campaign,"to PU overflow rules" if "LHE" in overflow and not campaign in LHE_overflow: print "adding",campaign,"to light input overflow rules" site_list = overflow['LHE']['site_list'] LHE_overflow[campaign] = copy.deepcopy( getattr(SI,site_list) ) ### get the task performance, for further massaging. if campaign in tune_performance or options.tune: print "performance",task.taskType,task.pathName if task.taskType in ['Processing','Production']: set_memory,set_time = getPerf( task.pathName ) #print "Performance %s GB %s min"%( set_memory,set_time) wfi.sendLog('equalizor','Performance tuning to %s GB %s min'%( set_memory,set_time)) ## get values from gmwsmon # massage the values : 95% percentile performance[task.pathName] = {} if set_memory: performance[task.pathName]['memory']=set_memory if set_time and False: performance[task.pathName]['time'] = set_time ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step if campaign in LHE_overflow: if task.taskType in ['Processing']: needs, task_name, running, idled = needs_action(wfi, task) needs_overide = overide_from_agent( wfi, needs_overide) extend_to = list(set(copy.deepcopy( LHE_overflow[campaign] ))) if stay_within_site_whitelist: extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist'])) ## restrict to stupid-site-whitelist extend_to = list(set(extend_to) & set(SI.sites_ready + force_sites)) if extend_to and needs or needs_overide: modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : extend_to ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']} wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s'%( task_name, wfo.name, running, idled , json.dumps( sorted(modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist'])))) altered_tasks.add( task.pathName ) else: wfi.sendLog('equalizor','%s of %s is running %d and pending %d'%( task_name, wfo.name, running, idled)) ### overflow the 76 digi-reco to the site holding the pileup if campaign in PU_overflow: force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False secondary_locations = set(SI.sites_ready + force_sites) for s in sec: if not s in PU_locations: presence = getDatasetPresence( url, s) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] PU_locations[s] = one_secondary_locations print "secondary is at",sorted(PU_locations[s]) secondary_locations = set([SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations ## we should add all sites that hold the secondary input if any ### given that we have the secondary location available, it is not necessary to use the add-hoc list ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready )) if any([task.pathName.endswith(finish) for finish in ['_0','StepOneProc','Production']]) : needs, task_name, running, idled = needs_action(wfi, task) ## removing the ones in the site whitelist already since they encode the primary input location if stay_within_site_whitelist: original_site_in_use = set(wfi.request['SiteWhitelist']) else: original_site_in_use = set(secondary_locations) ## remove the sites that have already running jobs gmon = wfi.getGlideMon() if gmon and task_name in gmon and 'Sites' in gmon[task_name]: site_in_use = set(gmon[task_name]['Sites']) print "removing",sorted(site_in_use) ## that determines where you want to run in addition augment_by = list((set(secondary_locations)- site_in_use) & original_site_in_use) else: print "no existing running site" augment_by = list(original_site_in_use) needs_overide = overide_from_agent( wfi, needs_overide) if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']: PU_overflow[campaign]['pending'] += idled print "raising overflow to",PU_overflow[campaign]['pending'],"for",PU_overflow[campaign]['max'] ## the step with an input ought to be the digi part : make this one go anywhere modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']} altered_tasks.add( task.pathName ) wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name, running, idled, json.dumps( sorted(augment_by), indent=2 ))) else: print task_name,"of",wfo.name,"running",running,"and pending",idled ### overflow the skims back to multi-core if campaign in ['Run2015D','Run2015C_25ns'] and task.taskType =='Skim': original_swl = wfi.request['SiteWhitelist'] needs, task_name, running, idled = needs_action(wfi, task) if (needs or needs_overide): modifications[wfo.name][task.pathName] = { 'AddWhitelist' : original_swl, "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']} altered_tasks.add( task.pathName ) wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name, running, idled, json.dumps( sorted(original_swl), indent=2 ))) if options.augment: print sorted(wfi.request['SiteWhitelist']),i_task,use_HLT ### add the HLT at partner of CERN if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [0,1] and use_HLT: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs=True needs = True ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide) and pending_HLT < max_HLT: pending_HLT += idled if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]: modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" ) print "\t",wfo.name,"adding addHLT up to",pending_HLT,"for",max_HLT print task.pathName ## this Replace does not work at all for HLT #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]: #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" ) #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT else: modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"], "Priority" : wfi.request['RequestPriority'], "Running" : running, "Pending" : idled} wfi.sendLog('equalizor','adding the HLT in whitelist of %s to %d for %d'%( task.pathName, pending_HLT, max_HLT)) if i_task==0 and not sec and use_T0: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs=True #needs = True good_type = wfi.request['RequestType'] in ['MonteCarlo','MonteCarloFromGEN'] read_lhe = ((not 'LheInputFiles' in wfi.request) or bool(wfi.request['LheInputFiles'])) good_type &= not read_lhe if not good_type and not options.augment: needs = False ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide): pending_T0 += idled if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]: if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["AddWhitelist"]: modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T0_CH_CERN" ) wfi,sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0)) elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]: if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"]: modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T0_CH_CERN" ) wfi,sendLog('equalizor','adding the T0 to replacement for %s to %d for %d'%( task.pathName, pending_T0, max_T0)) else: modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T0_CH_CERN"], "Priority" : wfi.request['RequestPriority'], "Running" : running, "Pending" : idled} wfi,sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0)) interface['modifications'].update( modifications ) ### manage the number of core and job resizing interface['cores']={'T2_CH_CERN_HLT': {'min':4,'max':16}, 'default': {'min':1, 'max':4}} interface['resizes'] = ['RunIISpring16DR80'] ### manage the modification of the memory and target time interface['time'] = defaultdict(list) interface['memory'] = defaultdict(list) max_N_mem = 10 max_N_time = 10 ## discretize the memory to 10 at most values mems = set([o['memory'] for t,o in performance.items() if 'memory' in o]) times = set([o['time'] for t,o in performance.items() if 'time' in o]) if len(mems)>max_N_mem: mem_step = int((max(mems) - min(mems))/ float(max_N_mem)) for t in performance: if not 'memory' in performance[t]: continue (m,r) = divmod(performance[t]['memory'], mem_step) performance[t]['memory'] = (m+1)*mem_step if len(times)>max_N_time: time_step = int((max(times) - min(times))/float(max_N_time)) for t in performance: if not 'time' in performance[t]: continue (m,r) = divmod(performance[t]['time'], time_step) performance[t]['time'] = (m+1)*time_step for t,o in performance.items(): if 'time' in o: interface['time'][str(o['time'])] .append( t ) if 'memory' in o: interface['memory'][str(o['memory'])].append( t ) ## close and save close( interface )
def batchor(url): UC = unifiedConfiguration() SI = global_SI() CI = campaignInfo() BI = batchInfo() ## get all workflows in assignment-approved with SubRequestType = relval all_wfs = [] for user in UC.get("user_relval"): all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain')) wfs = filter( lambda r: r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs) ## need a special treatment for those hi_wfs = filter( lambda r: r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs) by_campaign = defaultdict(set) by_hi_campaign = defaultdict(set) for wf in wfs: print "Relval:", wf['RequestName'], wf['Campaign'] by_campaign[wf['Campaign']].add(wf['PrepID']) for wf in hi_wfs: print "HI Relval:", wf['RequestName'], wf['Campaign'] by_hi_campaign[wf['Campaign']].add(wf['PrepID']) default_setup = { "go": True, "parameters": { "SiteWhitelist": ["T1_US_FNAL"], "MergedLFNBase": "/store/relval", "Team": "relval", "NonCustodialGroup": "RelVal" }, "custodial_override": "notape", "phedex_group": "RelVal", "lumisize": -1, "fractionpass": 0.0, "maxcopies": 1 } default_hi_setup = copy.deepcopy(default_setup) add_on = {} relval_routing = UC.get('relval_routing') def pick_one_site(p): ## modify the parameters on the spot to have only one site if "parameters" in p and "SiteWhitelist" in p["parameters"] and len( p["parameters"]["SiteWhitelist"]) > 1: choose_from = list( set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready)) picked = random.choice(choose_from) print "picked", picked, "from", choose_from p["parameters"]["SiteWhitelist"] = [picked] batches = BI.all() for campaign in by_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy(default_setup) for key in relval_routing: if key in campaign: ## augment with the routing information augment_with = relval_routing[key] print "Modifying the batch configuration because of keyword", key print "with", augment_with setup = deep_update(setup, augment_with) pick_one_site(setup) add_on[campaign] = setup sendLog('batchor', 'Adding the relval campaigns %s with parameters \n%s' % (campaign, json.dumps(setup, indent=2)), level='critical') BI.update(campaign, by_campaign[campaign]) for campaign in by_hi_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy(default_hi_setup) possible_sites = set(["T1_DE_KIT", "T1_FR_CCIN2P3"]) hi_site = random.choice(list(possible_sites)) setup["parameters"]["SiteWhitelist"] = [hi_site] pick_one_site(setup) add_on[campaign] = setup sendLog('batchor', 'Adding the HI relval campaigns %s with parameters \n%s' % (campaign, json.dumps(setup, indent=2)), level='critical') BI.update(campaign, by_hi_campaign[campaign]) ## only new campaigns in announcement for new_campaign in list( set(add_on.keys()) - set(CI.all(c_type='relval'))): ## this is new, and can be announced as such print new_campaign, "is new stuff" subject = "Request of RelVal samples batch %s" % new_campaign text = """Dear all, A new batch of relval workflows was requested. Batch ID: %s Details of the workflows: https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s This is an automated message""" % ( new_campaign, new_campaign, ) print subject print text to = ['*****@*****.**'] sendEmail(subject, text, destination=to) sendLog('batchor', text, level='critical') ## go through all existing campaigns and remove the ones not in use anymore ? for old_campaign in CI.all(c_type='relval'): all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True) if not all_in_batch: continue is_batch_done = all( map( lambda s: not s in [ 'completed', 'force-complete', 'running-open', 'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [wf['RequestStatus'] for wf in all_in_batch])) ## check all statuses if is_batch_done: #print "batch",old_campaign,"can be closed or removed if necessary" #campaigns[old_campaign]['go'] = False ## disable CI.pop(old_campaign) ## or just drop it all together ? BI.pop(old_campaign) print "batch", old_campaign, " configuration was removed" ## merge all anyways CI.update(add_on, c_type='relval')
def batchor( url ): UC = unifiedConfiguration() ## get all workflows in assignment-approved with SubRequestType = relval all_wfs = [] for user in UC.get("user_relval"): all_wfs = getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain') wfs = filter( lambda r :r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs) ## need a special treatment for those hi_wfs = filter( lambda r :r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs) by_campaign = defaultdict(set) by_hi_campaign = defaultdict(set) for wf in wfs: print "Relval:",wf['RequestName'], wf['Campaign'] by_campaign[wf['Campaign']].add( wf['RequestName'] ) for wf in hi_wfs: print "HI Relval:",wf['RequestName'], wf['Campaign'] by_hi_campaign[wf['Campaign']].add( wf['RequestName'] ) default_setup = { "go" :True, "parameters" : { "SiteWhitelist": [ "T1_US_FNAL" ], "MergedLFNBase": "/store/relval", "Team" : "relval", "NonCustodialGroup" : "RelVal" }, "custodial" : "T1_US_FNAL_MSS", "phedex_group" : "RelVal", "lumisize" : -1, "fractionpass" : 0.0, "maxcopies" : 1 } default_hi_setup = copy.deepcopy( default_setup ) add_on = {} batches = json.loads( open('batches.json').read() ) for campaign in by_campaign: ## get a bunch of information setup = copy.deepcopy( default_setup ) add_on[campaign] = setup sendLog('batchor','Adding the relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_campaign[campaign] )) + batches[campaign] )) for campaign in by_hi_campaign: ## get a bunch of information setup = copy.deepcopy( default_hi_setup ) hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"]) setup["parameters"]["SiteWhitelist"]=[ hi_site ] add_on[campaign] = setup sendLog('batchor','Adding the HI relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_hi_campaign[campaign] )) + batches[campaign] )) open('batches.json','w').write( json.dumps( batches , indent=2 ) ) ## open the campaign configuration campaigns = json.loads( open('campaigns.relval.json').read() ) ## protect for overwriting ?? for new_campaign in list(set(add_on.keys())-set(campaigns.keys())): ## this is new, and can be announced as such print new_campaign,"is new stuff" workflows = by_campaign[new_campaign] requester = list(set([wf.split('_')[0] for wf in workflows])) subject = "Request of RelVal samples batch %s"% new_campaign text="""Dear all, A new batch of relval workflows was requested. Batch ID: %s Requestor: %s Details of the workflows: https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s This is an automated message"""%( new_campaign, ', '.join(requester), new_campaign, #'\n'.join( sorted(workflows) ) ) print subject print text to = ['*****@*****.**'] sendEmail(subject, text, destination=to) sendLog('batchor',text, level='critical') ## merge all anyways campaigns.update( add_on ) ## write it out for posterity open('campaigns.json.updated','w').write(json.dumps( campaigns , indent=2)) ## read back rread = json.loads(open('campaigns.json.updated').read()) os.system('mv campaigns.json.updated campaigns.relval.json')
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue print "Progress [%d/%d]" % (iwfo, len(wfs)) ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( map( lambda s: not s in [ 'completed', 'running-open', 'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name)) continue if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count / float(expected_lumis) * 100. completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[wfi.getCampaign()].add(completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest( url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex'][ 'request_created'][0]['id'] results.append(True) except: results.append('Failed relval transfer') elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending", out, " to DDM" status = pass_to_dynamo( [out], N=n_copies, sites=destinations if destinations else None, group=group_spec if group_spec else None) results.append(status) if status == True: wfi.sendLog( 'closor', '%s is send to dynamo in %s copies %s %s' % (out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor', "could not add " + out + " to dynamo pool. check closor logs.", level='critical') wfi.sendLog( 'closor', "could not add " + out + " to dynamo pool. check closor logs.") else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace( 'announce', 'announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') #batches = json.loads(open('batches.json').read()) for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" if batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """ % (bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to)
def spawn_harvesting(url, wfi , in_full): #SI = siteInfo() SI = global_SI() all_OK = {} requests = [] outputs = wfi.request['OutputDatasets'] if ('EnableHarvesting' in wfi.request and wfi.request['EnableHarvesting']) or ('DQMConfigCacheID' in wfi.request and wfi.request['DQMConfigCacheID']): if not 'MergedLFNBase' in wfi.request: print "f****d up" sendEmail('screwed up wl cache','%s wl cache is bad'%(wfi.request['RequestName'])) all_OK['fake'] = False return all_OK,requests wfi = workflowInfo(url, wfi.request['RequestName']) dqms = [out for out in outputs if '/DQM' in out] if not all([in_full[dqm_input] for dqm_input in dqms]): wfi.sendLog('closor',"will not be able to assign the harvesting: holding up") for dqm_input in dqms: all_OK[dqm_input] = False ## raise the subscription to high priority sites = set(wfi.request['NonCustodialSites']) for site in sites: res = updateSubscription(url, site, dqm_input, priority='high') print "increased priority",res return all_OK,requests for dqm_input in dqms: ## handle it properly harvesting_schema = { 'Requestor': os.getenv('USER'), 'RequestType' : 'DQMHarvest', 'Group' : 'DATAOPS' } copy_over = [ 'AcquisitionEra', 'ProcessingString', 'DQMUploadUrl', 'CMSSWVersion', 'CouchDBName', 'CouchWorkloadDBName', 'CouchURL', 'DbsUrl', 'inputMode', 'DQMConfigCacheID', 'OpenRunningTimeout', 'ScramArch', 'CMSSWVersion', 'Campaign', 'Memory', #dummy 'SizePerEvent', #dummy 'GlobalTag', #dummy ] for item in copy_over: if item in wfi.request: harvesting_schema[item] = copy.deepcopy(wfi.request[item]) else: print item,"is not in initial schema" harvesting_schema['InputDataset'] = dqm_input harvesting_schema['TimePerEvent'] = 1 harvesting_schema['PrepID'] = 'Harvest-'+wfi.request['PrepID'] if len(wfi.request['RequestString'])>60: wfi.request['RequestString']= wfi.request['RequestString'][:60] print "truncating request string",wfi.request['RequestString'] harvesting_schema['RequestString'] = 'HARVEST-'+wfi.request['RequestString'] harvesting_schema['DQMHarvestUnit'] = 'byRun' harvesting_schema['ConfigCacheUrl'] = harvesting_schema['CouchURL'] ## uhm, how stupid is that ? harvesting_schema['RequestPriority'] = wfi.request['RequestPriority']*10 harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema) if not harvest_request: print "Error in making harvesting for",wfi.request['RequestName'] print "schema" print json.dumps( harvesting_schema, indent = 2) harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema) if not harvest_request: print "Error twice in harvesting for",wfi.request['RequestName'] print "schema" print json.dumps( harvesting_schema, indent = 2) if harvest_request: requests.append( harvest_request ) ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely data = reqMgrClient.setWorkflowApproved(url, harvest_request) print "created",harvest_request,"for harvesting of",dqm_input wfi.sendLog('closor',"created %s for harvesting of %s"%( harvest_request, dqm_input)) ## assign it directly team = wfi.request['Teams'][0] parameters={ 'SiteWhitelist' : [SI.SE_to_CE(se) for se in wfi.request['NonCustodialSites']], 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], 'execute' : True } if in_full[dqm_input]: print "using full copy at",in_full[dqm_input] parameters['SiteWhitelist'] = [SI.SE_to_CE(se) for se in in_full[dqm_input]] else: print "cannot do anything if not having a full copy somewhere" all_OK[dqm_input]=False continue result = reqMgrClient.assignWorkflow(url, harvest_request, team, parameters) if not result: #sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog('closor','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName'])) sendLog('closor','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), level='critical') else: #sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog('closor','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName'])) else: #print "could not make the harvesting for",wfo.name,"not announcing" wfi.sendLog('closor',"could not make the harvesting request") sendLog('closor',"could not make the harvesting request for %s"% wfi.request['RequestName'], level='critical') all_OK[dqm_input]=False return (all_OK, requests)
def completor(url, specific): mlock = moduleLock(silent=True) if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): return use_mcm = up.status['mcm'] if use_mcm: mcm = McMClient(dev=False) safe_mode = False CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() JC = JIRAClient() if up.status.get('jira', False) else None wfs = [] wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all()) wfs.extend( session.query(Workflow).filter( Workflow.status.startswith('assistance')).all()) ## just take it in random order so that not always the same is seen random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('completor', None) if max_per_round and not specific: wfs = wfs[:max_per_round] all_stuck = set() ## take into account what stagor was saying for itry in range(5): try: all_stuck.update( json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))) break except: time.sleep(2) for itry in range(5): try: ## take into account the block that needed to be repositioned recently all_stuck.update([ b.split('#')[0] for b in json.loads( eosRead('%s/missing_blocks.json' % monitor_dir)) ]) break except: time.sleep(2) ## take into account all stuck block and dataset from transfer team all_stuck.update(getAllStuckDataset()) good_fractions = {} overdoing_fractions = {} truncate_fractions = {} timeout = {} campaign_injection_delay = {} for c in CI.campaigns: if 'force-complete' in CI.campaigns[c]: good_fractions[c] = CI.campaigns[c]['force-complete'] if 'truncate-complete' in CI.campaigns[c]: truncate_fractions[c] = CI.campaigns[c]['truncate-complete'] if 'force-timeout' in CI.campaigns[c]: timeout[c] = CI.campaigns[c]['force-timeout'] if 'injection-delay' in CI.campaigns[c]: campaign_injection_delay[c] = CI.campaigns[c]['injection-delay'] if 'overdoing-complete' in CI.campaigns[c]: overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete'] long_lasting = {} WI = wtcInfo() overrides = WI.getForce() if use_mcm: ## add all workflow that mcm wants to get force completed mcm_force = mcm.get('/restapi/requests/forcecomplete') ## assuming this will be a list of actual prepids overrides['mcm'] = mcm_force print "can force complete on" print json.dumps(good_fractions, indent=2) print "can truncate complete on" print json.dumps(truncate_fractions, indent=2) print "can overide on" print json.dumps(overrides, indent=2) max_force = UC.get("max_force_complete") max_priority = UC.get("max_tail_priority") injection_delay_threshold = UC.get("injection_delay_threshold") injection_delay_priority = UC.get("injection_delay_priority") delay_priority_increase = UC.get("delay_priority_increase") default_fraction_overdoing = UC.get('default_fraction_overdoing') set_force_complete = set() # priority and time above which to fire a JIRA jira_priority_and_delays = { 110000: 21, 90000: 28, # 80000 : 60, #0 : 90 } for wfo in wfs: if specific and not specific in wfo.name: continue print "looking at", wfo.name ## get all of the same wfi = workflowInfo(url, wfo.name) pids = wfi.getPrepIDs() skip = False campaigns = wfi.getCampaigns() #if not any([c in good_fractions.keys() for c in campaigns]): skip=True #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True for user, spec in overrides.items(): if not spec: continue spec = filter(None, spec) if not wfi.request['RequestStatus'] in [ 'force-complete', 'completed' ]: if any(s in wfo.name for s in spec) or (wfo.name in spec) or any( pid in spec for pid in pids) or any(s in pids for s in spec): wfi = workflowInfo(url, wfo.name) forceComplete(url, wfi) skip = True wfi.notifyRequestor( "The workflow %s was force completed by request of %s" % (wfo.name, user), do_batch=False) wfi.sendLog( 'completor', '%s is asking for %s to be force complete' % (user, wfo.name)) break if wfo.status.startswith('assistance'): skip = True if skip: continue priority = wfi.request['RequestPriority'] if not 'Campaign' in wfi.request: continue if not wfi.request['RequestStatus'] in [ 'acquired', 'running-open', 'running-closed' ]: continue ## until we can map the output to task ... output_per_task = wfi.getOutputPerTask( ) ## can use that one, and follow mapping good_fraction_per_out = {} good_fraction_nodelay_per_out = {} truncate_fraction_per_out = {} #allowed_delay_per_out = {} for task, outs in output_per_task.items(): task_campaign = wfi.getCampaignPerTask(task) for out in outs: good_fraction_per_out[out] = good_fractions.get( task_campaign, 1000.) good_fraction_nodelay_per_out[out] = overdoing_fractions.get( task_campaign, default_fraction_overdoing) truncate_fraction_per_out[out] = truncate_fractions.get( task_campaign, 1000.) #allowed_delay_per_out[out] = timeout.get(task_campaign, 14) #print "force at", json.dumps( good_fraction_per_out, indent=2) #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2) now = time.mktime(time.gmtime()) / (60 * 60 * 24.) priority_log = filter(lambda change: change['Priority'] == priority, wfi.request.get('PriorityTransition', [])) if not priority_log: print "\tHas no priority log" priority_delay = 0 else: then = max([change['UpdateTime'] for change in priority_log]) / (60. * 60. * 24.) priority_delay = now - then ## in days print "priority was set to", priority, priority_delay, "[days] ago" running_log = filter( lambda change: change["Status" ] in ["running-open", "running-closed"], wfi.request['RequestTransition']) if not running_log: print "\tHas no running log" delay = 0 else: then = max([change['UpdateTime'] for change in running_log]) / (60. * 60. * 24.) delay = now - then ## in days #further check on delays cpuh = wfi.getComputingTime(unit='d') wfi.sendLog( 'completor', "Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago" % (cpuh, delay, priority, priority_delay)) if priority_delay != 0 and priority_delay < delay: ## regardless when it started running, set the delay to when priority was changed last delay = priority_delay ## this is supposed to be the very initial request date, inherited from clones injection_delay = None original = wfi if 'OriginalRequestName' in original.request: ## go up the clone chain original = workflowInfo(url, original.request['OriginalRequestName']) injected_log = filter( lambda change: change["Status"] in ["assignment-approved"], original.request['RequestTransition']) if injected_log: injected_on = injected_log[-1]['UpdateTime'] / (60. * 60. * 24.) injection_delay = now - injected_on delay_for_priority_increase = injection_delay #delay_for_priority_increase = delay (w, d) = divmod(delay, 7) print "\t" * int( w) + "Running since", delay, "[days] priority=", priority pop_a_jira = False ping_on_jira = 7 * (24 * 60 * 60) # 7 days for jp, jd in jira_priority_and_delays.items(): if priority >= jp and delay >= jd: pop_a_jira = True if pop_a_jira and JC: j, reopened, just_created = JC.create_or_last( prepid=wfi.request['PrepID'], priority=wfi.request['RequestPriority'], label='Late', reopen=True) last_time = JC.last_time(j) since_last_ping = time.mktime(time.gmtime()) - last_time if since_last_ping > ping_on_jira or just_created: j_comment = "Running since %.1f [days] at priority %d" % ( delay, priority) JC.comment(j.key, j_comment) if delay_for_priority_increase != None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority: quantized = 5000 ## quantize priority tail_cutting_priority = wfi.request['InitialPriority'] + int( (delay_priority_increase * (delay_for_priority_increase - injection_delay_threshold) / 7) / quantized) * quantized tail_cutting_priority += 101 ## to signal it is from this mechanism tail_cutting_priority = min( 400000, tail_cutting_priority) ## never go above 400k priority tail_cutting_priority = max( tail_cutting_priority, priority) ## never go below the current value if priority < tail_cutting_priority: if max_priority: sendLog( 'completor', "%s Injected since %s [days] priority=%s, increasing to %s" % (wfo.name, delay_for_priority_increase, priority, tail_cutting_priority), level='critical') wfi.sendLog( 'completor', 'bumping priority to %d for being injected since %s' % (tail_cutting_priority, delay_for_priority_increase)) reqMgrClient.changePriorityWorkflow( url, wfo.name, tail_cutting_priority) max_priority -= 1 else: sendLog( 'completor', "%s Injected since %s [days] priority=%s, would like to increase to %s" % (wfo.name, delay_for_priority_increase, priority, tail_cutting_priority), level='critical') wfi.sendLog( 'completor', 'would like to bump priority to %d for being injected since %s' % (tail_cutting_priority, delay_for_priority_increase)) print "Could be changing the priority to higher value, but too many already were done" _, prim, _, _ = wfi.getIO() is_stuck = all_stuck & prim if is_stuck: wfi.sendLog('completor', '%s is stuck' % ','.join(is_stuck)) monitor_delay = 7 allowed_delay = max([timeout.get(c, 14) for c in campaigns]) monitor_delay = min(monitor_delay, allowed_delay) ### just skip if too early, just for the sake of not computing the completion fraction just now. # maybe this is fast enough that we can do it for all if delay <= monitor_delay: print "not enough time has passed yet" continue long_lasting[wfo.name] = { "delay": delay, "injection_delay": injection_delay } percent_completions = wfi.getCompletionFraction(caller='completor') if not percent_completions: sendLog('completor', '%s has no output at all' % wfo.name, level='critical') continue is_over_allowed_delay = (all([ percent_completions[out] >= good_fraction_per_out.get(out, 1000.) for out in percent_completions ]) and delay >= allowed_delay) is_over_truncation_delay = (is_stuck and (all([ percent_completions[out] >= truncate_fraction_per_out.get( out, 1000.) for out in percent_completions ])) and delay >= allowed_delay) is_over_completion = (all([ percent_completions[out] >= good_fraction_nodelay_per_out.get( out, 1000.) for out in percent_completions ])) if is_over_completion: wfi.sendLog( 'completor', "all is over completed %s\n %s" % (json.dumps(good_fraction_nodelay_per_out, indent=2), json.dumps(percent_completions, indent=2))) elif is_over_allowed_delay: wfi.sendLog( 'completor', "all is above %s \n%s" % (json.dumps(good_fraction_per_out, indent=2), json.dumps(percent_completions, indent=2))) elif is_over_truncation_delay: wfi.sendLog( 'completor', "all is above %s truncation level, and the input is stuck\n%s" % (json.dumps(truncate_fraction_per_out, indent=2), json.dumps(percent_completions, indent=2))) else: long_lasting[wfo.name].update({ 'completion': sum(percent_completions.values()) / len(percent_completions), 'completions': percent_completions }) ## do something about the agents this workflow is in long_lasting[wfo.name]['agents'] = wfi.getAgents() wfi.sendLog( 'completor', "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s" % (json.dumps(percent_completions, indent=2), json.dumps(good_fraction_per_out, indent=2), json.dumps(truncate_fraction_per_out, indent=2), json.dumps(long_lasting[wfo.name]['agents'], indent=2))) continue #for output in percent_completions: # completions[output]['injected'] = then ran_at = wfi.request['SiteWhitelist'] wfi.sendLog('completor', "Required %s, time spend %s" % (cpuh, delay)) ##### WILL FORCE COMPLETE BELOW # only really force complete after n days ## find ACDCs that might be running if max_force > 0: print "going for force-complete of", wfo.name if not safe_mode: forceComplete(url, wfi) set_force_complete.add(wfo.name) wfi.sendLog('completor', 'going for force completing') wfi.notifyRequestor( "The workflow %s was force completed for running too long" % wfo.name) max_force -= 1 else: sendEmail( 'completor', 'The workflow %s is ready for force complete, but completor is in safe mode' % wfo.name) else: wfi.sendLog( 'completor', "too many completion this round, cannot force complete") if set_force_complete: sendLog( 'completor', 'The followings were set force-complete \n%s' % ('\n'.join(set_force_complete))) #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2)) text = "These have been running for long" #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )) eosFile('%s/longlasting.json' % monitor_dir, 'w').write(json.dumps(long_lasting, indent=2)).close() for wf, info in sorted(long_lasting.items(), key=lambda tp: tp[1]['delay'], reverse=True): delay = info['delay'] text += "\n %s : %s days" % (wf, delay) if 'completion' in info: text += " %d%%" % (info['completion'] * 100) print text
url = reqmgr_url wfs = getWorkflows(url, 'assigned', details=True) now = time.mktime( time.gmtime()) for wf in wfs: assigned_log = filter(lambda change : change["Status"] in ["assigned"],wf['RequestTransition']) if assigned_log: then = assigned_log[-1]['UpdateTime'] since = (now-then)/float(1*24*60*60.) if since>1.: print "workflow",wf['RequestName'],"is assigned since",then," that is",since,"days" sendLog('GQ','The workflow %s has been assigned for %.2f days'%(wf['RequestName'], since), level='critical') may_have_one=set() may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('away')).all()]) may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()]) wfs = [] wfs.extend( getWorkflows(url, 'running-open', details=True)) wfs.extend( getWorkflows(url, 'running-closed', details=True)) wfs.extend( getWorkflows(url, 'completed', details=True)) may_have_one_too = set() for wf in wfs: if wf['RequestName'] in may_have_one: #print wf['RequestName'],"and familly" may_have_one_too.update( getWorkflowById(url, wf['PrepID']) )
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all()) if specific: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read()) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog('assignor', "%s to be assigned" % wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: n_stalled += 1 no_go = True allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary)), level='critical') if not options.go: n_stalled += 1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'primary_AAA' in CI.campaigns[ wfh.request['Campaign']]: primary_aaa = primary_aaa or CI.campaigns[ wfh.request['Campaign']]['primary_AAA'] secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'secondary_AAA' in CI.campaigns[ wfh.request['Campaign']]: secondary_aaa = secondary_aaa or CI.campaigns[ wfh.request['Campaign']]['secondary_AAA'] for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite in SI.sites_not_ready for osite in opportunistic_sites ])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( 'assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay') continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial: wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed", sorted(sites_allowed) if primary_aaa: sites_allowed = initial_sites_allowed options.TrustSitelists = True wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) else: sites_allowed = sites_with_any_data wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) if secondary_aaa: options.TrustPUSitelists = True wfh.sendLog( 'assignor', "Reading secondary through xrootd from %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list( set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints", sorted(sites_allowed) if not len(sites_allowed): wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog( 'assignor', 'the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting' % wfo.name, level='critical') elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog( 'assignor', "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level='critical') # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock(secure) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') if forcings: sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name) bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' (GMT)' ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def transferor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status.startswith('considered')).all(): print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read()) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get(prim) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s), wfi=wfh) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendEmail("no request in staging", "no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor', None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if secondary: if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced = False is_real = False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: print "could not get mcm batch announcement, assuming not real" return announced, is_real if not use_mcm: announced, is_real = False, True else: if wfh.request['RequestType'] in ['ReReco']: announced, is_real = True, True else: announced, is_real = check_mcm(wfo.name) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog( 'transferor', "It is too soon to start transfer: %3.2fH remaining" % (now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): ## lock everything flat NLI.lock(dataset) if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=wfh.request['LumiList']))) if blocks: print "Reading", len(blocks), "in block whitelist" can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be prim_destination = [ site for site in destinations.keys() if not site in prim_location ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "not counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter( Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Not counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The output is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies, but no destinations seems available" ) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = input_sizes[ prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if site in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) > sec_size: all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)' % (sec, sec_size, site_se, SI.disk[site_se] * 1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(no_goes), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: priority = 'normal' cds = [ ds for ds in datasets + block_datasets if ds in max_priority ] if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds] >= 90000 for ds in cds]): priority = 'high' elif all([max_priority[ds] < 80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter( Transfer.phedexid == -int(phedexid)).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list( set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()
def equalizor(url, specific=None, options=None): up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return if not specific: workflows = getWorkflows(url, status='running-closed', details=True) workflows.extend(getWorkflows(url, status='running-open', details=True)) ## start from scratch modifications = defaultdict(dict) ## define regionality site => fallback allowed. feed on an ssb metric ?? mapping = defaultdict(list) reversed_mapping = defaultdict(list) regions = defaultdict(list) SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() for site in SI.sites_ready: region = site.split('_')[1] if not region in ['US', 'DE', 'IT']: continue regions[region] = [region] def site_in_depletion(s): return True if s in SI.sites_pressure: (m, r, pressure) = SI.sites_pressure[s] if float(m) < float(r): print s, m, r, "lacking pressure" return True else: print s, m, r, "pressure" pass return False for site in SI.sites_ready: region = site.split('_')[1] ## fallback to the region, to site with on-going low pressure mapping[site] = [ fb for fb in SI.sites_ready if any([('_%s_' % (reg) in fb and fb != site and site_in_depletion(fb)) for reg in regions[region]]) ] use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow")) if options.t0: use_T0 = True #if options.augment : use_T0 = True use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow")) if options.hlt: use_HLT = True #if options.augment : use_HLT=True if use_HLT: mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT') if use_T0: mapping['T2_CH_CERN'].append('T0_CH_CERN') #mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN') #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF') for reg in ['IT', 'DE', 'UK']: mapping['T2_CH_CERN'].extend( [fb for fb in SI.sites_ready if '_%s_' % reg in fb]) ## make them appear as OK to use force_sites = [] ## overflow CERN to underutilized T1s upcoming = json.loads(open('%s/GQ.json' % monitor_dir).read()) for possible in SI.sites_T1s: if not possible in upcoming: mapping['T2_CH_CERN'].append(possible) ## remove add-hoc sites from overflow mapping prevent_sites = ['T2_US_Purdue'] for prevent in prevent_sites: if prevent in mapping: mapping.pop(prevent) for src in mapping: for prevent in prevent_sites: if prevent in mapping[src]: mapping[src].remove(prevent) ## create the reverse mapping for the condor module for site, fallbacks in mapping.items(): for fb in fallbacks: reversed_mapping[fb].append(site) ## this is the fallback mapping print "Direct mapping : site => overflow" print json.dumps(mapping, indent=2) print "Reverse mapping : dest <= from origin" print json.dumps(reversed_mapping, indent=2) altered_tasks = set() def running_idle(wfi, task_name): gmon = wfi.getGlideMon() #print gmon if not gmon: return (0, 0) if not task_name in gmon: return (0, 0) return (gmon[task_name]['Running'], gmon[task_name]['Idle']) def needs_action(wfi, task, min_idled=100, pressure=0.2): task_name = task.pathName.split('/')[-1] running, idled = running_idle(wfi, task_name) go = True if not idled and not running: go = False if idled < 100: go = False if (not running and idled) or (running and (idled / float(running) > pressure)): go = True else: go = False return go, task_name, running, idled def getPerf(task): task = task.split('/')[1] + '/' + task.split('/')[-1] try: u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s' % task print u perf_data = json.loads(os.popen('curl -s --retry 5 %s' % u).read()) except Exception as e: print str(e) return (None, None) buckets = perf_data['aggregations']["2"]['buckets'] s_m = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets) w_m = sum(bucket['doc_count'] for bucket in buckets) m_m = max(bucket['key'] for bucket in buckets) if buckets else None b_m = None if w_m > 100: b_m = m_m try: perf_data = json.loads( os.popen( 'curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s' % task).read()) except Exception as e: print str(e) return (b_m, None) buckets = perf_data['aggregations']["2"]['buckets'] s_t = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets) w_t = sum(bucket['doc_count'] for bucket in buckets) m_t = max(bucket['key'] for bucket in buckets) if buckets else None b_t = None if w_t > 100: b_t = m_t return (b_m, b_t) def getcampaign(task): taskname = task.pathName.split('/')[-1] if hasattr(task, 'prepID'): return task.prepID.split('-')[1] elif taskname.count('-') >= 1: return taskname.split('-')[1] else: return None def close(interface): open('%s/equalizor.json.new' % monitor_dir, 'w').write(json.dumps(interface, indent=2)) os.system('mv %s/equalizor.json.new %s/equalizor.json' % (monitor_dir, monitor_dir)) os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' % (monitor_dir, monitor_dir, time.mktime(time.gmtime()))) interface = {'reversed_mapping': reversed_mapping, 'modifications': {}} if options.augment or options.remove: interface['modifications'] = json.loads( open('%s/equalizor.json' % monitor_dir).read())['modifications'] if options.remove: if specific in interface['modifications']: print "poping", specific interface['modifications'].pop(specific) close(interface) return PU_locations = {} PU_overflow = {} LHE_overflow = {} tune_performance = [] pending_HLT = 0 max_HLT = 60000 pending_T0 = 0 max_T0 = 60000 try: gmon = json.loads( os.popen( 'curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT' ).read()) pending_HLT += gmon["Running"] pending_HLT += gmon["MatchingIdle"] except: pass stay_within_site_whitelist = False specific_task = None if specific and ":" in specific: specific, specific_task = specific.split(':') if specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status == 'away').all() performance = {} no_routing = [] random.shuffle(wfs) for wfo in wfs: if wfo.name in no_routing and not options.augment: continue if specific and not specific in wfo.name: continue if specific: wfi = workflowInfo(url, wfo.name) else: cached = filter(lambda d: d['RequestName'] == wfo.name, workflows) if not cached: continue wfi = workflowInfo(url, wfo.name, request=cached[0]) ## only running should get re-routed if not wfi.request['RequestStatus'] in [ 'running-open', 'running-closed' ] and not specific: continue tasks_and_campaigns = [] for task in wfi.getWorkTasks(): tasks_and_campaigns.append((task, getcampaign(task))) _, _, _, sec = wfi.getIO() ## check needs override needs_overide = False if not needs_overide and options.augment: needs_overide = True def overide_from_agent(wfi, needs_overide): bad_agents = [] #'http://cmssrv219.fnal.gov:5984'] if not bad_agents: return needs_overide if needs_overide: return True agents = wfi.getAgents() wqss = ['Running', 'Acquired'] if any([ agent in agents.get(wqs, {}).keys() for wqs, agent in itertools.product(wqss, bad_agents) ]): print "overriding the need for bad agent" needs_overide = True return needs_overide ## now parse this for action for i_task, (task, campaign) in enumerate(tasks_and_campaigns): if options.augment: print task.pathName print campaign tune = CI.get(campaign, 'tune', options.tune) if tune and not campaign in tune_performance: tune_performance.append(campaign) overflow = CI.get(campaign, 'overflow', {}) if overflow: if "PU" in overflow and not campaign in PU_overflow: PU_overflow[campaign] = copy.deepcopy(overflow['PU']) print "adding", campaign, "to PU overflow rules" if "LHE" in overflow and not campaign in LHE_overflow: print "adding", campaign, "to light input overflow rules" site_list = overflow['LHE']['site_list'] LHE_overflow[campaign] = copy.deepcopy( getattr(SI, site_list)) ### get the task performance, for further massaging. if campaign in tune_performance or options.tune: print "performance", task.taskType, task.pathName if task.taskType in ['Processing', 'Production']: set_memory, set_time = getPerf(task.pathName) #print "Performance %s GB %s min"%( set_memory,set_time) wfi.sendLog( 'equalizor', 'Performance tuning to %s GB %s min' % (set_memory, set_time)) ## get values from gmwsmon # massage the values : 95% percentile performance[task.pathName] = {} if set_memory: performance[task.pathName]['memory'] = set_memory if set_time and False: performance[task.pathName]['time'] = set_time ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step if campaign in LHE_overflow: if task.taskType in ['Processing']: needs, task_name, running, idled = needs_action(wfi, task) needs_overide = overide_from_agent(wfi, needs_overide) extend_to = list(set(copy.deepcopy( LHE_overflow[campaign]))) if stay_within_site_whitelist: extend_to = list( set(extend_to) & set(wfi.request['SiteWhitelist']) ) ## restrict to stupid-site-whitelist extend_to = list( set(extend_to) & set(SI.sites_ready + force_sites)) if extend_to and needs or needs_overide: modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist": extend_to, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s' % (task_name, wfo.name, running, idled, json.dumps( sorted(modifications[wfo.name][task.pathName] ['ReplaceSiteWhitelist'])))) altered_tasks.add(task.pathName) else: wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d' % (task_name, wfo.name, running, idled)) ### overflow the 76 digi-reco to the site holding the pileup if campaign in PU_overflow: force = PU_overflow[campaign][ 'force'] if 'force' in PU_overflow[campaign] else False secondary_locations = set(SI.sites_ready + force_sites) for s in sec: if not s in PU_locations: presence = getDatasetPresence(url, s) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] PU_locations[s] = one_secondary_locations print "secondary is at", sorted(PU_locations[s]) secondary_locations = set( [SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations ## we should add all sites that hold the secondary input if any ### given that we have the secondary location available, it is not necessary to use the add-hoc list ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready )) if any([ task.pathName.endswith(finish) for finish in ['_0', 'StepOneProc', 'Production'] ]): needs, task_name, running, idled = needs_action(wfi, task) ## removing the ones in the site whitelist already since they encode the primary input location if stay_within_site_whitelist: original_site_in_use = set( wfi.request['SiteWhitelist']) else: original_site_in_use = set(secondary_locations) ## remove the sites that have already running jobs gmon = wfi.getGlideMon() if gmon and task_name in gmon and 'Sites' in gmon[ task_name]: site_in_use = set(gmon[task_name]['Sites']) print "removing", sorted(site_in_use) ## that determines where you want to run in addition augment_by = list((set(secondary_locations) - site_in_use) & original_site_in_use) else: print "no existing running site" augment_by = list(original_site_in_use) needs_overide = overide_from_agent(wfi, needs_overide) if augment_by and ( needs or needs_overide or force) and PU_overflow[campaign][ 'pending'] < PU_overflow[campaign]['max']: PU_overflow[campaign]['pending'] += idled print "raising overflow to", PU_overflow[campaign][ 'pending'], "for", PU_overflow[campaign]['max'] ## the step with an input ought to be the digi part : make this one go anywhere modifications[wfo.name][task.pathName] = { "AddWhitelist": augment_by, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } altered_tasks.add(task.pathName) wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s' % (task_name, wfo.name, running, idled, json.dumps(sorted(augment_by), indent=2))) else: print task_name, "of", wfo.name, "running", running, "and pending", idled ### overflow the skims back to multi-core if campaign in ['Run2015D', 'Run2015C_25ns' ] and task.taskType == 'Skim': original_swl = wfi.request['SiteWhitelist'] needs, task_name, running, idled = needs_action(wfi, task) if (needs or needs_overide): modifications[wfo.name][task.pathName] = { 'AddWhitelist': original_swl, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } altered_tasks.add(task.pathName) wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s' % (task_name, wfo.name, running, idled, json.dumps(sorted(original_swl), indent=2))) if options.augment: print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT ### add the HLT at partner of CERN if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [ 0, 1 ] and use_HLT: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs = True needs = True ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide) and pending_HLT < max_HLT: pending_HLT += idled if task.pathName in modifications[ wfo.name] and 'AddWhitelist' in modifications[ wfo.name][task.pathName]: modifications[wfo.name][task.pathName][ "AddWhitelist"].append("T2_CH_CERN_HLT") print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT print task.pathName ## this Replace does not work at all for HLT #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]: #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" ) #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT else: modifications[wfo.name][task.pathName] = { "AddWhitelist": ["T2_CH_CERN_HLT"], "Priority": wfi.request['RequestPriority'], "Running": running, "Pending": idled } wfi.sendLog( 'equalizor', 'adding the HLT in whitelist of %s to %d for %d' % (task.pathName, pending_HLT, max_HLT)) if i_task == 0 and not sec and use_T0: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs = True #needs = True good_type = wfi.request['RequestType'] in [ 'MonteCarlo', 'MonteCarloFromGEN' ] read_lhe = ((not 'LheInputFiles' in wfi.request) or bool(wfi.request['LheInputFiles'])) good_type &= not read_lhe if not good_type and not options.augment: needs = False ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide): pending_T0 += idled if task.pathName in modifications[ wfo.name] and 'AddWhitelist' in modifications[ wfo.name][task.pathName]: if not "T0_CH_CERN" in modifications[wfo.name][ task.pathName]["AddWhitelist"]: modifications[wfo.name][task.pathName][ "AddWhitelist"].append("T0_CH_CERN") wfi, sendLog( 'equalizor', 'adding the T0 for %s to %d for %d' % (task.pathName, pending_T0, max_T0)) elif task.pathName in modifications[ wfo. name] and 'ReplaceSiteWhitelist' in modifications[ wfo.name][task.pathName]: if not "T0_CH_CERN" in modifications[wfo.name][ task.pathName]["ReplaceSiteWhitelist"]: modifications[wfo.name][task.pathName][ "ReplaceSiteWhitelist"].append("T0_CH_CERN") wfi, sendLog( 'equalizor', 'adding the T0 to replacement for %s to %d for %d' % (task.pathName, pending_T0, max_T0)) else: modifications[wfo.name][task.pathName] = { "AddWhitelist": ["T0_CH_CERN"], "Priority": wfi.request['RequestPriority'], "Running": running, "Pending": idled } wfi, sendLog( 'equalizor', 'adding the T0 for %s to %d for %d' % (task.pathName, pending_T0, max_T0)) interface['modifications'].update(modifications) ### manage the number of core and job resizing interface['cores'] = { 'T2_CH_CERN_HLT': { 'min': 4, 'max': 16 }, 'default': { 'min': 1, 'max': 4 } } interface['resizes'] = ['RunIISpring16DR80'] ### manage the modification of the memory and target time interface['time'] = defaultdict(list) interface['memory'] = defaultdict(list) max_N_mem = 10 max_N_time = 10 ## discretize the memory to 10 at most values mems = set([o['memory'] for t, o in performance.items() if 'memory' in o]) times = set([o['time'] for t, o in performance.items() if 'time' in o]) if len(mems) > max_N_mem: mem_step = int((max(mems) - min(mems)) / float(max_N_mem)) for t in performance: if not 'memory' in performance[t]: continue (m, r) = divmod(performance[t]['memory'], mem_step) performance[t]['memory'] = (m + 1) * mem_step if len(times) > max_N_time: time_step = int((max(times) - min(times)) / float(max_N_time)) for t in performance: if not 'time' in performance[t]: continue (m, r) = divmod(performance[t]['time'], time_step) performance[t]['time'] = (m + 1) * time_step for t, o in performance.items(): if 'time' in o: interface['time'][str(o['time'])].append(t) if 'memory' in o: interface['memory'][str(o['memory'])].append(t) ## close and save close(interface)
def assignor(url, specific=None, talk=True, options=None): if userLock() and not options.manual: return mlock = moduleLock() if mlock() and not options.manual: return if not componentInfo().check() and not options.manual: return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() SI = global_SI() ###NLI = newLockInfo() ###if not NLI.free() and not options.go: return LI = lockInfo() #if not LI.free() and not options.go and not options.manual: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass aaa_mapping = json.loads(eosRead('%s/equalizor.json' % monitor_pub_dir))['mapping'] all_stuck = set() all_stuck.update( json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') # Temporarily switch off prioritization random.shuffle(wfos) ##order by priority instead of random """ if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank( wfn ): return cache.index( wfn ) if wfn in cache else 0 wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True) print "10 first",[wfo.name for wfo in wfos[:10]] print "10 last",[wfo.name for wfo in wfos[-10:]] else: random.shuffle( wfos ) """ for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue if not options.manual and 'rucio' in (wfo.name).lower(): continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" wfh.sendLog('assignor', "%s to be assigned %s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed, sites_not_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('assignor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('assignor', critical_msg, level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) blocks = wfh.getBlocks() if blocks: wfh.sendLog( 'assignor', "Needs {} blocks in input {}".format(len(blocks), '\n'.join(blocks))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters and primary: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] wfh.sendLog( 'assignor', "Initial values for primary_AAA=%s and secondary_AAA=%s" % (primary_aaa, secondary_aaa)) if primary_aaa: if "T2_CH_CERN_HLT" in sites_allowed: sites_allowed.remove("T2_CH_CERN_HLT") if "T2_CH_CERN_HLT" not in sites_not_allowed: sites_not_allowed.append("T2_CH_CERN_HLT") ## keep track of this, after secondary input location restriction : that's how you want to operate it initial_sites_allowed = copy.deepcopy(sites_allowed) set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) ## if they are requested for processing, they should bbe all closed already # FIXME: remove this closeAllBlocks #closeAllBlocks(url, prim, blocks) ## should be 2 but for the time-being let's lower it to get things going _copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) # TODO Alan on 1/april/2020: keep the AAA functionality if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_allowed: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_allowed) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if isStoreResults: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1t2_only = [ ce for ce in sites_allowed if [ce.startswith('T1') or ce.startswith('T2')] ] if t1t2_only: # try to pick from T1T2 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])] # then pick any otherwise else: sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] print "available=", SI.disk[sites_out[0]] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'SiteBlacklist': sites_not_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: # Do not set TrustPUSitelist to True if there is no secondary if secondary: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] # FIXME: decide which of the lines below needs to remain... eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) if wfh.producePremix() and (not wfh.isRelval()): title = "Heavy workflow assigned to {}".format( parameters['SiteWhitelist']) body = "Workflow name: {}".format( wfh.request['RequestName']) body += "\nOutput dataset(s): {}".format( wfh.request['OutputDatasets']) body += "\nAssigned to: {}".format( parameters['SiteWhitelist']) sendEmail( title, body, destination=[ '*****@*****.**' ]) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def completor(url, specific): mlock = moduleLock(silent=True) if mlock(): return use_mcm = True up = componentInfo(soft=['mcm','wtc','jira']) if not up.check(): return use_mcm = up.status['mcm'] if use_mcm: mcm = McMClient(dev=False) safe_mode = False CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() JC = JIRAClient() if up.status.get('jira',False) else None wfs = [] wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() ) ## just take it in random order so that not always the same is seen random.shuffle( wfs ) max_per_round = UC.get('max_per_round').get('completor',None) if max_per_round and not specific: wfs = wfs[:max_per_round] all_stuck = set() ## take into account what stagor was saying for itry in range(5): try: all_stuck.update( json.loads( eosRead('%s/stuck_transfers.json'%monitor_pub_dir))) break except: time.sleep(2) for itry in range(5): try: ## take into account the block that needed to be repositioned recently all_stuck.update( [b.split('#')[0] for b in json.loads( eosRead('%s/missing_blocks.json'%monitor_dir)) ] ) break except: time.sleep(2) ## take into account all stuck block and dataset from transfer team all_stuck.update( getAllStuckDataset()) good_fractions = {} overdoing_fractions = {} truncate_fractions = {} timeout = {} campaign_injection_delay = {} for c in CI.campaigns: if 'force-complete' in CI.campaigns[c]: good_fractions[c] = CI.campaigns[c]['force-complete'] if 'truncate-complete' in CI.campaigns[c]: truncate_fractions[c] = CI.campaigns[c]['truncate-complete'] if 'force-timeout' in CI.campaigns[c]: timeout[c] = CI.campaigns[c]['force-timeout'] if 'injection-delay' in CI.campaigns[c]: campaign_injection_delay[c] = CI.campaigns[c]['injection-delay'] if 'overdoing-complete' in CI.campaigns[c]: overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete'] long_lasting = {} WI = wtcInfo() overrides = WI.getForce() if use_mcm: ## add all workflow that mcm wants to get force completed mcm_force = mcm.get('/restapi/requests/forcecomplete') ## assuming this will be a list of actual prepids overrides['mcm'] = mcm_force print "can force complete on" print json.dumps( good_fractions ,indent=2) print "can truncate complete on" print json.dumps( truncate_fractions ,indent=2) print "can overide on" print json.dumps( overrides, indent=2) max_force = UC.get("max_force_complete") max_priority = UC.get("max_tail_priority") injection_delay_threshold = UC.get("injection_delay_threshold") injection_delay_priority = UC.get("injection_delay_priority") delay_priority_increase = UC.get("delay_priority_increase") default_fraction_overdoing = UC.get('default_fraction_overdoing') set_force_complete = set() # priority and time above which to fire a JIRA jira_priority_and_delays = { 110000 : 21, 90000 : 28, # 80000 : 60, #0 : 90 } for wfo in wfs: if specific and not specific in wfo.name: continue print "looking at",wfo.name ## get all of the same wfi = workflowInfo(url, wfo.name) pids = wfi.getPrepIDs() skip=False campaigns = wfi.getCampaigns() #if not any([c in good_fractions.keys() for c in campaigns]): skip=True #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True for user,spec in overrides.items(): if not spec: continue spec = filter(None, spec) if not wfi.request['RequestStatus'] in ['force-complete', 'completed']: if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec): wfi = workflowInfo(url, wfo.name) forceComplete(url , wfi ) skip=True wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False) wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name)) break if wfo.status.startswith('assistance'): skip = True if skip: continue priority = wfi.request['RequestPriority'] if not 'Campaign' in wfi.request: continue if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue ## until we can map the output to task ... output_per_task = wfi.getOutputPerTask() ## can use that one, and follow mapping good_fraction_per_out = {} good_fraction_nodelay_per_out = {} truncate_fraction_per_out = {} #allowed_delay_per_out = {} for task,outs in output_per_task.items(): task_campaign = wfi.getCampaignPerTask( task ) for out in outs: good_fraction_per_out[out] = good_fractions.get(task_campaign,1000.) good_fraction_nodelay_per_out[out] = overdoing_fractions.get(task_campaign,default_fraction_overdoing) truncate_fraction_per_out[out] = truncate_fractions.get(task_campaign,1000.) #allowed_delay_per_out[out] = timeout.get(task_campaign, 14) #print "force at", json.dumps( good_fraction_per_out, indent=2) #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2) now = time.mktime(time.gmtime()) / (60*60*24.) priority_log = filter(lambda change: change['Priority'] == priority,wfi.request.get('PriorityTransition',[])) if not priority_log: print "\tHas no priority log" priority_delay = 0 else: then = max([change['UpdateTime'] for change in priority_log]) / (60.*60.*24.) priority_delay = now - then ## in days print "priority was set to",priority,priority_delay,"[days] ago" running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition']) if not running_log: print "\tHas no running log" delay = 0 else: then = max([change['UpdateTime'] for change in running_log]) / (60.*60.*24.) delay = now - then ## in days #further check on delays cpuh = wfi.getComputingTime(unit='d') wfi.sendLog('completor',"Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"%( cpuh, delay, priority, priority_delay)) if priority_delay!=0 and priority_delay < delay: ## regardless when it started running, set the delay to when priority was changed last delay = priority_delay ## this is supposed to be the very initial request date, inherited from clones injection_delay = None original = wfi if 'OriginalRequestName' in original.request: ## go up the clone chain original = workflowInfo(url, original.request['OriginalRequestName']) injected_log = filter(lambda change : change["Status"] in ["assignment-approved"],original.request['RequestTransition']) if injected_log: injected_on = injected_log[-1]['UpdateTime'] / (60.*60.*24.) injection_delay = now - injected_on delay_for_priority_increase = injection_delay #delay_for_priority_increase = delay (w,d) = divmod(delay, 7 ) print "\t"*int(w)+"Running since",delay,"[days] priority=",priority pop_a_jira = False ping_on_jira = 7 *(24*60*60) # 7 days for jp,jd in jira_priority_and_delays.items(): if priority >= jp and delay >= jd: pop_a_jira = True if pop_a_jira and JC: j,reopened,just_created = JC.create_or_last( prepid = wfi.request['PrepID'], priority = wfi.request['RequestPriority'], label = 'Late', reopen = True) last_time = JC.last_time( j ) since_last_ping = time.mktime(time.gmtime()) - last_time if since_last_ping > ping_on_jira or just_created: j_comment = "Running since %.1f [days] at priority %d"%( delay, priority) JC.comment(j.key, j_comment) if delay_for_priority_increase!=None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority: quantized = 5000 ## quantize priority tail_cutting_priority = wfi.request['InitialPriority']+ int((delay_priority_increase * (delay_for_priority_increase - injection_delay_threshold) / 7) / quantized) * quantized tail_cutting_priority += 101 ## to signal it is from this mechanism tail_cutting_priority = min(400000, tail_cutting_priority) ## never go above 400k priority tail_cutting_priority = max(tail_cutting_priority, priority) ## never go below the current value if priority < tail_cutting_priority: if max_priority: sendLog('completor',"%s Injected since %s [days] priority=%s, increasing to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical') wfi.sendLog('completor','bumping priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase)) reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority) max_priority-=1 else: sendLog('completor',"%s Injected since %s [days] priority=%s, would like to increase to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical') wfi.sendLog('completor','would like to bump priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase)) print "Could be changing the priority to higher value, but too many already were done" _,prim,_,_ = wfi.getIO() is_stuck = all_stuck & prim if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck)) monitor_delay = 7 allowed_delay = max([timeout.get(c,14) for c in campaigns]) monitor_delay = min(monitor_delay, allowed_delay) ### just skip if too early, just for the sake of not computing the completion fraction just now. # maybe this is fast enough that we can do it for all if delay <= monitor_delay: print "not enough time has passed yet" continue long_lasting[wfo.name] = { "delay" : delay, "injection_delay" : injection_delay } percent_completions = wfi.getCompletionFraction(caller='completor') if not percent_completions: sendLog('completor','%s has no output at all'% wfo.name, level='critical') continue is_over_allowed_delay = (all([percent_completions[out] >= good_fraction_per_out.get(out,1000.) for out in percent_completions]) and delay >= allowed_delay) is_over_truncation_delay = (is_stuck and (all([percent_completions[out] >= truncate_fraction_per_out.get(out,1000.) for out in percent_completions])) and delay >= allowed_delay) is_over_completion = (all([percent_completions[out] >= good_fraction_nodelay_per_out.get(out,1000.) for out in percent_completions])) if is_over_completion: wfi.sendLog('completor', "all is over completed %s\n %s"%( json.dumps( good_fraction_nodelay_per_out, indent=2 ), json.dumps( percent_completions, indent=2 ) )) elif is_over_allowed_delay: wfi.sendLog('completor', "all is above %s \n%s"%( json.dumps(good_fraction_per_out, indent=2 ), json.dumps( percent_completions, indent=2 ) )) elif is_over_truncation_delay: wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( json.dumps(truncate_fraction_per_out, indent=2 ), json.dumps( percent_completions, indent=2 ) ) ) else: long_lasting[wfo.name].update({ 'completion': sum(percent_completions.values()) / len(percent_completions), 'completions' : percent_completions }) ## do something about the agents this workflow is in long_lasting[wfo.name]['agents'] = wfi.getAgents() wfi.sendLog('completor', "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"%(json.dumps( percent_completions, indent=2), json.dumps(good_fraction_per_out, indent=2), json.dumps( truncate_fraction_per_out, indent=2), json.dumps( long_lasting[wfo.name]['agents'], indent=2) )) continue #for output in percent_completions: # completions[output]['injected'] = then ran_at = wfi.request['SiteWhitelist'] wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay)) ##### WILL FORCE COMPLETE BELOW # only really force complete after n days ## find ACDCs that might be running if max_force>0: print "going for force-complete of",wfo.name if not safe_mode: forceComplete(url, wfi ) set_force_complete.add( wfo.name ) wfi.sendLog('completor','going for force completing') wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name) max_force -=1 else: sendEmail('completor', 'The workflow %s is ready for force complete, but completor is in safe mode'%wfo.name) else: wfi.sendLog('completor',"too many completion this round, cannot force complete") if set_force_complete: sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete))) #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2)) text="These have been running for long" #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )) eosFile('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )).close() for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True): delay = info['delay'] text += "\n %s : %s days"% (wf, delay) if 'completion' in info: text += " %d%%"%( info['completion']*100 ) print text
def stagor(url, specific=None, options=None): if not componentInfo().check(): return SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() TS = transferStatuses() cached_transfer_statuses = TS.content() transfer_statuses = {} done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost_blocks = json.loads( eosRead('%s/lost_blocks_datasets.json' % monitor_dir)) lost_files = json.loads( eosRead('%s/lost_files_datasets.json' % monitor_dir)) known_lost_blocks = {} known_lost_files = {} for dataset in set(lost_blocks.keys() + lost_files.keys()): b, f = findLostBlocksFiles(url, dataset) if dataset in lost_blocks and not b: print dataset, "has no really lost blocks" else: known_lost_blocks[dataset] = [i['name'] for i in b] if dataset in lost_files and not f: print dataset, "has no really lost files" else: known_lost_files[dataset] = [i['name'] for i in f] def time_point(label="", sub_lap=False): now = time.mktime(time.gmtime()) nows = time.asctime(time.gmtime()) print "Time check (%s) point at : %s" % (label, nows) print "Since start: %s [s]" % (now - time_point.start) if sub_lap: print "Sub Lap : %s [s]" % (now - time_point.sub_lap) time_point.sub_lap = now else: print "Lap : %s [s]" % (now - time_point.lap) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime( time.gmtime()) time_point("Check cached transfer") ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging wfois = [] needs = defaultdict(list) needs_by_priority = defaultdict(list) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) if wfi.request['RequestStatus'] in [ 'running-open', 'running-closed', 'completed', 'assigned', 'acquired' ]: wfi.sendLog('stagor', "is in status %s" % wfi.request['RequestStatus']) wfo.status = 'away' session.commit() continue if not wfi.request['RequestStatus'] in ['assignment-approved']: ## should be setting 'away' too ## that usually happens for relvals if wfi.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfi.isRelval(): wfo.status = 'forget' session.commit() continue else: print wfo.name, "is", wfi.request['RequestStatus'] #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus'])) sendLog("stagor", "%s is in %s, set away" % (wfo.name, wfi.request['RequestStatus']), level='critical') wfo.status = 'away' session.commit() continue wfois.append((wfo, wfi)) _, primaries, _, secondaries = wfi.getIO() for dataset in list(primaries) + list(secondaries): needs[wfo.name].append(dataset) done_by_input[dataset] = {} completion_by_input[dataset] = {} needs_by_priority[wfi.request['RequestPriority']].append(dataset) wfi.sendLog('stagor', '%s needs %s' % (wfo.name, dataset)) time_point("Check staging workflows") open('%s/dataset_requirements.json' % monitor_dir, 'w').write(json.dumps(needs, indent=2)) for prio in needs_by_priority: needs_by_priority[prio] = list(set(needs_by_priority[prio])) open('%s/dataset_priorities.json' % monitor_dir, 'w').write(json.dumps(needs_by_priority, indent=2)) dataset_endpoints = defaultdict(set) endpoint_in_downtime = defaultdict(set) #endpoint_completed = defaultdict(set) endpoint_incompleted = defaultdict(set) #endpoint = defaultdict(set) send_back_to_considered = set() ## first check if anything is inactive all_actives = set([ transfer.phedexid for transfer in session.query(TransferImp).filter( TransferImp.active).all() ]) for active_phedexid in all_actives: skip = True transfers_phedexid = session.query(TransferImp).filter( TransferImp.phedexid == active_phedexid).all() for imp in transfers_phedexid: if imp.workflow.status == 'staging': skip = False sendLog( 'stagor', "\t%s is staging for %s" % (imp.phedexid, imp.workflow.name)) if skip: sendLog('stagor', "setting %s inactive" % active_phedexid) for imp in transfers_phedexid: imp.active = False session.commit() all_actives = sorted( set([ transfer.phedexid for transfer in session.query( TransferImp).filter(TransferImp.active).all() ])) for phedexid in all_actives: if specific: continue ## check on transfer completion not_cached = False if phedexid in cached_transfer_statuses: ### use a cache for transfer that already looked done sendLog('stagor', "read %s from cache" % phedexid) checks = cached_transfer_statuses[phedexid] else: ## I actually would like to avoid that all I can sendLog('stagor', 'Performing spurious transfer check on %s' % phedexid, level='critical') checks = checkTransferStatus(url, phedexid, nocollapse=True) try: print json.dumps(checks, indent=2) except: print checks if not checks: ## this is going to bias quite heavily the rest of the code. we should abort here #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical') #return False sendLog( 'stagor', 'Stagor has got a skewed input from checkTransferStatus', level='critical') checks = {} pass else: TS.add(phedexid, checks) time_point("Check transfer status %s" % phedexid, sub_lap=True) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname] = {} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][phedexid] = all( map(lambda i: i >= good_enough, checks[dsname].values())) completion_by_input[dsname][phedexid] = checks[dsname].values() if checks: sendLog( 'stagor', "Checks for %s are %s" % (phedexid, [node.values() for node in checks.values()])) done = all( map( lambda i: i >= good_enough, list( itertools.chain.from_iterable( [node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? if not_cached: print "Transfer status was not cached" else: print "ERROR with the scubscriptions API of ", phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False transfers_phedexid = session.query(TransferImp).filter( TransferImp.phedexid == phedexid).all() for imp in transfers_phedexid: tr_wf = imp.workflow if tr_wf: # and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id] = {} done_by_wf_id[tr_wf.id][phedexid] = done if done: imp.active = False session.commit() for ds in checks: for s, v in checks[ds].items(): dataset_endpoints[ds].add(s) if done: sendLog('stagor', "%s is done" % phedexid) TS.add(phedexid, checks) else: sendLog( 'stagor', "%s is not finished %s" % (phedexid, pprint.pformat(checks))) ##pprint.pprint( checks ) ## check if the destination is in down-time for ds in checks: sites_incomplete = [ SI.SE_to_CE(s) for s, v in checks[ds].items() if v < good_enough ] sites_incomplete_down = [ s for s in sites_incomplete if not s in SI.sites_ready ] ## no space means no transfer should go there : NO, it does not work in the long run #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))] if sites_incomplete_down: sendLog( 'stagor', "%s are in downtime, while waiting for %s to get there" % (",".join(sites_incomplete_down), ds)) endpoint_in_downtime[ds].update(sites_incomplete_down) if sites_incomplete: endpoint_incompleted[ds].update(sites_incomplete) time_point("Check on-going transfers") print "End points" for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k]) print json.dumps(dataset_endpoints, indent=2) print "End point in down time" for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k]) print json.dumps(endpoint_in_downtime, indent=2) print "End point incomplete in down time" for k in endpoint_incompleted: endpoint_incompleted[k] = list(endpoint_incompleted[k]) print json.dumps(endpoint_incompleted, indent=2) #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2)) eosFile('%s/transfer_statuses.json' % monitor_dir, 'w').write(json.dumps(TS.content(), indent=2)).close() eosFile('%s/dataset_endpoints.json' % monitor_dir, 'w').write(json.dumps(dataset_endpoints, indent=2)).close() already_stuck = json.loads( eosRead('%s/stuck_transfers.json' % monitor_pub_dir)).keys() already_stuck.extend(getAllStuckDataset()) missing_in_action = defaultdict(list) print "-" * 10, "Checking on workflows in staging", "-" * 10 #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM'] #for what in forget_about: # if not done_by_input[what]: # done_by_input[what] = {'fake':True} ## come back to workflows and check if they can go available_cache = defaultdict(lambda: defaultdict(float)) presence_cache = defaultdict(dict) time_point("Preparing for more") for wfo, wfi in wfois: print "#" * 30 time_point("Forward checking %s" % wfo.name, sub_lap=True) ## the site white list takes site, campaign, memory and core information (_, primaries, _, secondaries, sites_allowed) = wfi.getSiteWhiteList(verbose=False) se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] se_allowed.sort() se_allowed_key = ','.join(se_allowed) readys = {} for need in list(primaries) + list(secondaries): if not need in done_by_input: wfi.sendLog('stagor', "missing transfer report for %s" % need) readys[need] = False ## should warn someone about this !!! ## it cannot happen, by construction sendEmail('missing transfer report', '%s does not have a transfer report' % (need)) continue if not done_by_input[need] and need in list(secondaries): wfi.sendLog( 'stagor', "assuming it is OK for secondary %s to have no attached transfers" % need) readys[need] = True done_by_input[need] = {"fake": True} continue if len(done_by_input[need]) and all(done_by_input[need].values()): wfi.sendLog('stagor', "%s is ready" % need) print json.dumps(done_by_input[need], indent=2) readys[need] = True else: wfi.sendLog( 'stagor', "%s is not ready \n%s" % (need, json.dumps(done_by_input[need], indent=2))) readys[need] = False if readys and all(readys.values()): if wfo.status == 'staging': wfi.sendLog('stagor', "all needs are fullfilled, setting staged") wfo.status = 'staged' session.commit() else: wfi.sendLog('stagor', "all needs are fullfilled, already") print json.dumps(readys, indent=2) else: wfi.sendLog('stagor', "missing requirements") copies_needed, _ = wfi.getNCopies() jump_ahead = False re_transfer = False ## there is missing input let's do something more elaborated for need in list(primaries): #+list(secondaries): if endpoint_in_downtime[need] and endpoint_in_downtime[ need] == endpoint_incompleted[need]: #print need,"is going to an end point in downtime" wfi.sendLog( 'stagor', "%s has only incomplete endpoint in downtime\n%s" % (need, endpoint_in_downtime[need])) re_transfer = True if not se_allowed_key in available_cache[need]: available_cache[need][ se_allowed_key] = getDatasetBlocksFraction( url, need, sites=se_allowed) if available_cache[need][se_allowed_key] >= copies_needed: wfi.sendLog( 'stagor', "assuming it is OK to move on like this already for %s" % need) jump_ahead = True else: wfi.sendLog( 'stagor', "Available %s times" % available_cache[need][se_allowed_key]) missing_and_downtime = list( set(endpoint_in_downtime[need]) & set(endpoint_incompleted[need])) if missing_and_downtime: wfi.sendLog( 'stagor', "%s is incomplete at %s which is in downtime, trying to move along" % (need, ','.join(missing_and_downtime))) jump_ahead = True else: wfi.sendLog( 'stagor', "continue waiting for transfers for optimum production performance." ) ## compute a time since staging to filter jump starting ? # check whether the inputs is already in the stuck list ... for need in list(primaries) + list(secondaries): if need in already_stuck: wfi.sendLog('stagor', "%s is stuck, so try to jump ahead" % need) jump_ahead = True if jump_ahead or re_transfer: details_text = "checking on availability for %s to jump ahead" % wfo.name details_text += '\n%s wants %s copies' % (wfo.name, copies_needed) copies_needed = max(1, copies_needed - 1) details_text += '\nlowering by one unit to %s' % copies_needed wfi.sendLog('stagor', details_text) all_check = True prim_where = set() for need in list(primaries): if not se_allowed_key in presence_cache[need]: presence_cache[need][ se_allowed_key] = getDatasetPresence( url, need, within_sites=se_allowed) presence = presence_cache[need][se_allowed_key] prim_where.update(presence.keys()) available = available_cache[need][se_allowed_key] this_check = (available >= copies_needed) wfi.sendLog( 'stagor', "%s is available %s times (%s), at %s" % (need, available, this_check, se_allowed_key)) all_check &= this_check if not all_check: break for need in list(secondaries): ## I do not want to check on the secon ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point this_check = all(done_by_input[need].values()) wfi.sendLog( 'stagor', "%s is this much transfered %s" % (need, json.dumps(done_by_input[need], indent=2))) all_check &= this_check #if not se_allowed_key in presence_cache[need]: # presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) ## restrict to where the primary is #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where]) #this_check = all([there for (there,frac) in presence.values()]) #print need,"is present at all sites:",this_check #all_check&= this_check if all_check and not re_transfer: wfi.sendLog( 'stagor', "needs are sufficiently fullfilled, setting staged") wfo.status = 'staged' session.commit() else: print wfo.name, "has to wait a bit more" wfi.sendLog('stagor', "needs to wait a bit more") else: wfi.sendLog('stagor', "not checking availability") if re_transfer: wfi.sendLog( 'stagor', "Sending back to considered because of endpoint in downtime" ) if wfo.status == 'staging': wfo.status = 'considered' session.commit() send_back_to_considered.add(wfo.name) time_point("Checked affected workflows") if send_back_to_considered: #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered))) sendLog('stagor', "sending back to considered the following workflows \n%s" % ('\n'.join(send_back_to_considered)), level='critical') print "-" * 10, "Checking on non-available datasets", "-" * 10 ## now check on those that are not fully available for dsname in available_cache.keys(): ## squash the se_allowed_key key available_cache[dsname] = min(available_cache[dsname].values()) really_stuck_dataset = set() for dsname, available in available_cache.items(): using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter( Workflow.name == using_it).first() if wf: using_wfos.append(wf) if not len(done_by_input[dsname]): print "For dataset", dsname, "there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending", wf.name, "back to considered" wf.status = 'considered' session.commit() #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) sendLog('stagor', "%s was send back and might be trouble" % wf.name, level='critical') else: print "would send", wf.name, "back to considered" #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) sendLog( 'stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good." % wf.name, level='critical') continue ## not compatible with checking on secondary availability #if all([wf.status != 'staging' for wf in using_wfos]): # ## means despite all checks that input is not needed # continue if available < 1.: print "incomplete", dsname ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only lost_blocks, lost_files = findLostBlocksFiles( url, dsname) if (not dsname.endswith('/RAW')) else ([], []) lost_block_names = [item['name'] for item in lost_blocks] lost_file_names = [item['name'] for item in lost_files] if lost_blocks: #print json.dumps( lost , indent=2 ) ## estimate for how much ! fraction_loss, _, n_missing = getDatasetBlockFraction( dsname, lost_block_names) print "We have lost", len( lost_block_names ), "blocks", lost_block_names, "for %f%%" % (100. * fraction_loss) if fraction_loss > 0.05: ## 95% completion mark #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss)) sendLog( 'stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss' % (dsname, len(lost_block_names), n_missing, 100 * fraction_loss), level='critical') ## the workflow should be rejected ! for wf in using_wfos: if wf.status == 'staging': print wf.name, "is doomed. setting to trouble" wf.status = 'trouble' session.commit() sendLog( 'stagor', '%s has too much loss on the input dataset %s. Missing %d blocks, for %d events, %3.2f %% loss' % (wf.name, dsname, len(lost_block_names), n_missing, 100 * fraction_loss), level='critical') else: ## probably enough to make a ggus and remove if not dsname in known_lost_blocks: #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) )) sendLog( 'stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s' % (dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join(lost_block_names)), level='critical') known_lost_blocks[dsname] = [ i['name'] for i in lost_blocks ] really_stuck_dataset.add(dsname) if lost_files: fraction_loss, _, n_missing = getDatasetFileFraction( dsname, lost_file_names) print "We have lost", len( lost_file_names ), "files", lost_file_names, "for %f%%" % fraction_loss if fraction_loss > 0.05: #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss)) sendLog( 'stagor', '%s is missing %d files, for %d events, %f %% loss' % (dsname, len(lost_file_names), n_missing, fraction_loss), level='critical') for wf in using_wfos: if wf.status == 'staging': print wf.name, "is doomed. setting to trouble" wf.status = 'trouble' session.commit() else: ## probably enough to make a ggus and remove if not dsname in known_lost_files: #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names))) sendLog( 'stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s' % (dsname, len(lost_file_names), n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical') known_lost_files[dsname] = [ i['name'] for i in lost_files ] ## should the status be change to held-staging and pending on a ticket missings = [ pid for (pid, d) in done_by_input[dsname].items() if d == False ] print "\t", done_by_input[dsname] print "\tneeds", len(done_by_input[dsname]) print "\tgot", done_by_input[dsname].values().count(True) print "\tmissing", missings missing_in_action[dsname].extend(missings) rr = eosFile('%s/lost_blocks_datasets.json' % monitor_dir, 'w') rr.write(json.dumps(known_lost_blocks, indent=2)) rr.close() rr = eosFile('%s/lost_files_datasets.json' % monitor_dir, 'w') rr.write(json.dumps(known_lost_files, indent=2)) rr.close() eosFile('%s/incomplete_transfers.json' % monitor_dir, 'w').write(json.dumps(missing_in_action, indent=2)).close() print "Stuck transfers and datasets" print json.dumps(missing_in_action, indent=2) TD = transferDataset() datasets_by_phid = defaultdict(set) for dataset in missing_in_action: for phid in missing_in_action[dataset]: #print dataset,"stuck through",phid datasets_by_phid[phid].add(dataset) for k in datasets_by_phid: #datasets_by_phid[k] = list(datasets_by_phid[k]) TD.add(k, list(datasets_by_phid[k])) #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close() eosFile('%s/really_stuck_dataset.json' % base_eos_dir, 'w').write(json.dumps(list(really_stuck_dataset), indent=2)).close() print '\n' * 2, "Datasets really stuck" print '\n'.join(really_stuck_dataset) ############# ## not going further for what matters ############# return
def rejector(url, specific, options=None): up = componentInfo(soft=['wtc', 'jira']) #if not up.check(): return if specific and specific.startswith('/'): ## this is for a dataset print setDatasetStatus(specific, 'INVALID') return if options.filelist: wfs = [] for line in filter(None, open(options.filelist).read().split('\n')): print line wfs.extend( session.query(Workflow).filter( Workflow.name.contains(line)).all()) elif specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() if not wfs: batches = batchInfo().content() for bname in batches: if specific == bname: for pid in batches[bname]: b_wfs = getWorkflowById(url, pid) for wf in b_wfs: wfs.append( session.query(Workflow).filter( Workflow.name == wf).first()) break else: wfs = session.query(Workflow).filter( Workflow.status == 'assistance-clone').all() #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all()) ## be careful then on clone case by case options.clone = True print "not supposed to function yet" return print len(wfs), "to reject" if len(wfs) > 1: print "\n".join([wfo.name for wfo in wfs]) answer = raw_input('Reject these') if not answer.lower() in ['y', 'yes']: return for wfo in wfs: #wfo = session.query(Workflow).filter(Workflow.name == specific).first() if not wfo: print "cannot reject", spec return wfi = workflowInfo(url, wfo.name) comment = "" if options.comments: comment = ", reason: " + options.comments if options.keep: wfi.sendLog( 'rejector', 'invalidating the workflow by unified operator%s' % comment) else: wfi.sendLog( 'rejector', 'invalidating the workflow and outputs by unified operator%s' % comment) results = invalidate(url, wfi, only_resub=True, with_output=(not options.keep)) if all(results): print wfo.name, "rejected" if options and options.clone: wfo.status = 'trouble' session.commit() schema = wfi.getSchema() schema['Requestor'] = os.getenv('USER') schema['Group'] = 'DATAOPS' schema['OriginalRequestName'] = wfo.name if 'ProcessingVersion' in schema: schema['ProcessingVersion'] = int( schema['ProcessingVersion'] ) + 1 ## dubious str->int conversion else: schema['ProcessingVersion'] = 2 for k in schema.keys(): if k.startswith('Team'): schema.pop(k) if k.startswith('checkbox'): schema.pop(k) ## a few tampering of the original request if options.Memory: if schema['RequestType'] == 'TaskChain': it = 1 while True: t = 'Task%d' % it it += 1 if t in schema: schema[t]['Memory'] = options.Memory else: break else: schema['Memory'] = options.Memory if options.short_task and schema['RequestType'] == 'TaskChain': translate = {} it = 1 while True: tt = 'Task%d' % it if tt in schema: tname = schema[tt]['TaskName'] ntname = 'T%d' % it translate[tname] = ntname it += 1 schema[tt]['TaskName'] = ntname if 'InputTask' in schema[tt]: itname = schema[tt]['InputTask'] schema[tt]['InputTask'] = translate[itname] else: break for k in schema.get('ProcessingString', {}).keys(): schema['ProcessingString'][ translate[k]] = schema['ProcessingString'].pop(k) for k in schema.get('AcquisitionEra', {}).keys(): schema['AcquisitionEra'][ translate[k]] = schema['AcquisitionEra'].pop(k) if options.Multicore: ## to do : set it properly in taskchains if schema['RequestType'] == 'TaskChain': tasks, set_to = options.Multicore.split( ':') if ':' in options.Multicore else ( "", options.Multicore) set_to = int(set_to) tasks = tasks.split(',') if tasks else ['Task1'] it = 1 while True: tt = 'Task%d' % it it += 1 if tt in schema: tname = schema[tt]['TaskName'] if tname in tasks or tt in tasks: mem = schema[tt]['Memory'] mcore = schema[tt].get('Multicore', 1) factor = (set_to / float(mcore)) fraction_constant = 0.4 mem_per_core_c = int( (1 - fraction_constant) * mem / float(mcore)) print "mem per core", mem_per_core_c print "base mem", mem ## adjusting the parameter in the clone schema[tt]['Memory'] += ( set_to - mcore) * mem_per_core_c schema[tt]['Multicore'] = set_to schema[tt]['TimePerEvent'] /= factor else: break else: schema['Multicore'] = options.Multicore if options.deterministic: if schema['RequestType'] == 'TaskChain': schema['Task1']['DeterministicPileup'] = True if options.EventsPerJob: if schema['RequestType'] == 'TaskChain': schema['Task1']['EventsPerJob'] = options.EventsPerJob else: schema['EventsPerJob'] = options.EventsPerJob if options.EventAwareLumiBased: schema['SplittingAlgo'] = 'EventAwareLumiBased' if options.TimePerEvent: schema['TimePerEvent'] = options.TimePerEvent if options.ProcessingString: schema['ProcessingString'] = options.ProcessingString if options.AcquisitionEra: schema['AcquisitionEra'] = options.AcquisitionEra if options.runs: schema['RunWhitelist'] = map(int, options.runs.split(',')) if options.PrepID: schema['PrepID'] = options.PrepID if schema['RequestType'] == 'TaskChain' and options.no_output: ntask = schema['TaskChain'] for it in range(1, ntask - 1): schema['Task%d' % it]['KeepOutput'] = False schema['TaskChain'] = ntask - 1 schema.pop('Task%d' % ntask) if options.priority: schema['RequestPriority'] = options.priority ## update to the current priority schema['RequestPriority'] = wfi.request['RequestPriority'] ## drop shit on the way to reqmgr2 schema = reqMgrClient.purgeClonedSchema(schema) print "submitting" if (options.to_stepchain and (schema['RequestType'] == 'TaskChain')): ## transform the schema into StepChain schema print "Transforming a TaskChain into a StepChain" mcore = 0 mem = 0 schema['RequestType'] = 'StepChain' schema['StepChain'] = schema.pop('TaskChain') schema['SizePerEvent'] = 0 schema['TimePerEvent'] = 0 step = 1 s_n = {} while True: if 'Task%d' % step in schema: sname = 'Step%d' % step schema[sname] = schema.pop('Task%d' % step) tmcore = schema[sname].pop('Multicore') tmem = schema[sname].pop('Memory') if mcore and tmcore != mcore: wfi.sendLog( 'rejector', 'the conversion to stepchain encoutered different value of Multicore %d != %d' % (tmcore, mcore)) sendLog( 'rejector', 'the conversion of %s to stepchain encoutered different value of Multicore %d != %d' % (wfo.name, tmcore, mcore), level='critical') mcore = max(mcore, tmcore) mem = max(mem, tmem) schema[sname]['StepName'] = schema[sname].pop( 'TaskName') s_n[schema[sname]['StepName']] = sname if 'InputTask' in schema[sname]: schema[sname]['InputStep'] = schema[sname].pop( 'InputTask') eff = 1. up_s = sname while True: ## climb up a step. supposedely already all converted up_s = s_n.get( schema[up_s].get('InputStep', None), None) if up_s: ## multiply with the efficiency eff *= schema[up_s].get( 'FilterEfficiency', 1.) else: ## or stop there break if not 'KeepOutput' in schema[sname]: ## this is a weird translation capability. Absence of keepoutput in step means : keep the output. while in TaskChain absence means : drop schema[sname]['KeepOutput'] = False schema['TimePerEvent'] += eff * schema[sname].pop( 'TimePerEvent') schema['SizePerEvent'] += eff * schema[sname].pop( 'SizePerEvent') step += 1 else: break schema['Multicore'] = mcore schema['Memory'] = mem print json.dumps(schema, indent=2) newWorkflow = reqMgrClient.submitWorkflow(url, schema) if not newWorkflow: msg = "Error in cloning {}".format(wfo.name) print(msg) wfi.sendLog('rejector', msg) # Get the error message time.sleep(5) data = reqMgrClient.requestManagerPost( url, "/reqmgr2/data/request", schema) wfi.sendLog('rejector', data) print json.dumps(schema, indent=2) return print newWorkflow data = reqMgrClient.setWorkflowApproved(url, newWorkflow) print data wfi.sendLog( 'rejector', 'Cloned into %s by unified operator %s' % (newWorkflow, comment)) #wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False) else: wfo.status = 'trouble' if options.set_trouble else 'forget' wfi.notifyRequestor('Rejected by unified operator %s' % (comment), do_batch=False) session.commit() else: msg = "Error in rejecting {}: {}".format(wfo.name, results) print(msg) wfi.sendLog('rejector', msg)
def injector(url, options, specific): use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] workflows = getWorkflows(url, status=options.wmstatus, user=options.user) workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco") ) ## regardless of users, pick up all ReReco on the table print len(workflows), "in line" cannot_inject = set() ## browse for assignment-approved requests, browsed for ours, insert the diff for wf in workflows: if specific and not specific in wf: continue exists = session.query(Workflow).filter(Workflow.name == wf).first() if not exists: wfi = workflowInfo(url, wf) #wl = getWorkLoad(url, wf) ## check first that there isn't related here with something valid can_add = True ## first try at finding a match # print wfi.request familly = session.query(Workflow).filter( Workflow.name.contains(wfi.request['PrepID'])).all() if not familly: #req_familly = getWorkflowById( url, wl['PrepID']) #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly] pids = wfi.getPrepIDs() req_familly = [] for pid in pids: req_familly.extend(getWorkflowById(url, pid, details=True)) familly = [] print len(req_familly), "members" for req_member in req_familly: #print "member",req_member['RequestName'] owfi = workflowInfo(url, req_member['RequestName'], request=req_member) other_pids = owfi.getPrepIDs() if set(pids) == set(other_pids): ## this is a real match familly.extend( session.query(Workflow).filter( Workflow.name == req_member['RequestName']).all()) for lwfo in familly: if lwfo: ## we have it already if not lwfo.status in [ 'forget', 'trouble', 'forget-unlock', 'forget-out-unlock' ]: sendLog( 'injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status)) print "Should not put", wf, "because of", lwfo.name, lwfo.status cannot_inject.add(wf) can_add = False if not can_add: continue wfi.sendLog('injector', "considering %s" % wf) new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus) session.add(new_wf) session.commit() time.sleep(0.5) else: #print "already have",wf pass if cannot_inject: #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject))) sendLog( 'injector', 'These workflow cannot be added in because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning') ## passing a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) no_replacement = set() ## pick up replacements for wf in session.query(Workflow).filter( Workflow.status == 'trouble').all(): print wf.name if specific and not specific in wf.name: continue print wf.name wfi = workflowInfo(url, wf.name) wl = wfi.request #getWorkLoad(url, wf.name) familly = getWorkflowById(url, wl['PrepID']) true_familly = [] for member in familly: if member == wf.name: continue fwl = getWorkLoad(url, member) if options.replace: if member != options.replace: continue else: if fwl['RequestDate'] < wl['RequestDate']: continue if fwl['RequestType'] == 'Resubmission': continue if fwl['RequestStatus'] in ['None', None, 'new']: continue if fwl['RequestStatus'] in [ 'rejected', 'rejected-archived', 'aborted', 'aborted-archived' ]: continue true_familly.append(fwl) if len(true_familly) == 0: #sendLog('injector','%s had no replacement'%wf.name, level='critical') wfi.sendLog( 'injector', 'the workflow was found in trouble with no replacement') no_replacement.add(wf.name) continue else: wfi.sendLog( 'injector', 'the workflow was found in trouble and has a replacement') print wf.name, "has", len(familly), "familly members" print wf.name, "has", len(true_familly), "true familly members" ##we cannot have more than one of them !!! pick the last one if len(true_familly) > 1: #sendEmail('multiple wf','please take a look at injector for %s'%wf.name) sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical') for fwl in true_familly[-1:]: member = fwl['RequestName'] new_wf = session.query(Workflow).filter( Workflow.name == member).first() if not new_wf: sendLog('injector', "putting %s as replacement of %s" % (member, wf.name)) status = 'away' if fwl['RequestStatus'] in ['assignment-approved']: status = 'considered' new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus']) wf.status = 'forget' session.add(new_wf) else: if new_wf.status == 'forget': continue sendLog( 'injector', "getting %s as replacement of %s" % (new_wf.name, wf.name)) wf.status = 'forget' for tr in session.query(Transfer).all(): if wf.id in tr.workflows_id: sw = copy.deepcopy(tr.workflows_id) sw.remove(wf.id) sw.append(new_wf.id) tr.workflows_id = sw print tr.phedexid, "got", new_wf.name if new_wf.status != 'away': print "\t setting it considered" new_wf.status = 'considered' if tr.phedexid < 0: ## set it back to positive tr.phedexid = -tr.phedexid session.commit() ## don't do that automatically #wf.status = 'forget' session.commit() if no_replacement: #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement))) sendLog('injector', 'workflow with no replacement, %s \n are dangling there' % ('\n'.join(no_replacement)), level='critical')
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return #if notRunningBefore( 'stagor' ): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all()) if specific: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) #if specific: # #wfos = session.query(Workflow).filter(Workflow.name==specific).all() # wfos = session.query(Workflow).filter(Workflow.name.contains(specific)).all() #if not wfos: # if specific: # wfos = session.query(Workflow).filter(Workflow.status=='considered').all() # wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) # wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog('assignor', "%s to be assigned" % wfo.name) ## check if by configuration we gave it a GO if not CI.go(wfh.request['Campaign']) and not options.go: wfh.sendLog('assignor', "No go for %s" % wfh.request['Campaign']) n_stalled += 1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite in SI.sites_not_ready for osite in opportunistic_sites ])) down_time = True ## should this be send back to considered ? """ if available_fractions and not all([available>=1. for available in available_fractions.values()]): print "The input dataset is not located in full over sites" print json.dumps(available_fractions) if not options.test and not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## skip skip skip """ ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) sendEmail( "cannot be assigned due to downtime", "%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered." % wfo.name) continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early: wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status continue ## default back to white list to original white list with any data print "Allowed", sites_allowed if options.primary_aaa: sites_allowed = initial_sites_allowed #options.useSiteListAsLocation = True options.TrustSitelists = True else: sites_allowed = sites_with_any_data wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) if options.restrict: print "Allowed", sites_allowed sites_allowed = sites_with_any_data print "Selected", sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for", list( set(sites_allowed) - set(sites_with_data)), "?" print "Whitelist site with any data", list( set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): wfh.sendLog('assignor', "cannot be assign with no matched sites") sendEmail("cannot be assigned", "%s has no whitelist" % (wfo.name)) n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team ## high priority team agent #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): # ## consider SDSC # parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] # parameters['useSiteListAsLocation'] = True # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0: # parameters['SiteWhitelist'] = ['T3_US_SDSC'] # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") sendEmail( "Fallback to EventBased", "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name) elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") sendEmail( "Modifying the job per events", "the workflow %s is too heavy in number of jobs explosion" % wfo.name) # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: sendEmail( "issue with event splitting for run-dependent MC", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: sendEmail( "setting lumi splitting for run-dependent MC", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: sendEmail("leaving splitting untouched for PU_RD*", "please check on " + wfo.name) wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock(secure) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def singleClone(url, wfname, actions, comment, do=False): wfi = workflowInfo(url, wfname) payload = wfi.getSchema() initial = wfi.request payload['Requestor'] = os.getenv('USER') payload['Group'] = 'DATAOPS' payload['OriginalRequestName'] = initial['RequestName'] payload['RequestPriority'] = initial['RequestPriority'] if 'ProcessingVersion' in initial: payload['ProcessingVersion'] = int(initial['ProcessingVersion']) + 1 else: payload['ProcessingVersion'] = 2 payload = reqMgrClient.purgeClonedSchema(payload) if actions: for action in actions: if action.startswith('mem') and actions[action] != "" and actions[ action] != 'Same': if 'TaskChain' in payload: print "Setting memory for clone of task chain" mem_dict = {} it = 1 while True: t = 'Task%d' % it it += 1 if t in payload: tname = payload[t]['TaskName'] mem_dict[tname] = int(actions[action]) print "Memory set for Task%d" % it else: break payload['Memory'] = mem_dict else: print "Setting memory for non-taskchain workflow" payload['Memory'] = int(actions[action]) print "Memory set to " + actions[action] print "Clone payload" # print json.dumps( payload , indent=2) print actions #Create clone clone = reqMgrClient.submitWorkflow(url, payload) if not clone: print "Error in making clone for", initial["RequestName"] clone = reqMgrClient.submitWorkflow(url, payload) if not clone: print "Error twice in making clone for", initial["RequestName"] sendLog('actor', 'Failed to make a clone twice for %s!' % initial["RequestName"], level='critical') wfi.sendLog( 'actor', 'Failed to make a clone twice for %s!' % initial["RequestName"]) return None if actions: for action in actions: if action.startswith('split'): cloneinfo = workflowInfo(url, clone) splittings = cloneinfo.getSplittingsNew(strip=True) if actions[action] != 'Same' and actions[ action] != 'max' and actions[action] != '': factor = int( actions[action][0:-1]) if 'x' in actions[action] else 2 for split in splittings: split_par = split['splitParams'] for act in [ 'avg_events_per_job', 'events_per_job', 'lumis_per_job' ]: if act in split_par: wfi.sendLog( 'actor', 'Changing %s (%d) by a factor %d' % (act, split_par[act], factor)) split_par[act] /= factor print "to", split_par[act] break #split['requestName'] = clone #print "changing the splitting of",clone #print json.dumps( split, indent=2 ) #print reqMgrClient.setWorkflowSplitting(url, clone, split ) elif 'max' in actions[action]: for split in splittings: split_par = split['splitParams'] for act in [ 'avg_events_per_job', 'events_per_job', 'lumis_per_job' ]: if act in split_par: wfi.sendLog( 'actor', 'Max splitting set for %s (%d' % (act, split_par[act])) print "Changing %s (%d) " % (act, split_par[act]), split_par[act] = 1 print "to max splitting ", split_par[act] break #split['requestName'] = clone #print "changing the splitting of",clone #print json.dumps( split, indent=2 ) #print reqMgrClient.setWorkflowSplitting(url, clone, split ) print "changing the splitting of", clone print json.dumps(splittings, indent=2) print reqMgrClient.setWorkflowSplitting(url, clone, splittings) #Approve data = reqMgrClient.setWorkflowApproved(url, clone) #wfi.sendLog('actor','Cloned into %s'%clone) # wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment )) # wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False) print data return clone
def actor(url,options=None): if userLock('actor'): return up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed try: action_list = json.loads(os.popen('curl -s -k https://vocms0113.cern.ch:80/getaction?days=15').read()) ## now we have a list of things that we can take action on except: print "Not able to load action list :(" sendLog('actor','Not able to load action list', level='critical') return print action_list if not action_list: print "EMPTY!" return for wfname in action_list: print '-'*100 print "Looking at",wfname,"for recovery options" to_clone = False to_acdc = False for key in action_list[wfname]: if key == 'Parameters': tasks = action_list[wfname][key] elif key == 'Action' and action_list[wfname][key] == 'acdc': print "Going to create ACDCs for ", wfname to_acdc = True elif key == 'Action' and action_list[wfname][key] == 'clone': print "Going to clone ", wfname to_clone = True if not to_acdc and not to_clone: sendLog('actor','Action submitted for something other than acdc and clone for workflow %s'%wfname,level='critical') print "Can only do acdcs and clones! Skipping workflow ",wfname continue if not tasks: sendLog('actor','Empty action submitted for workflow %s'%wfname,level='critical') print "Moving on. Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor','Going to clone %s'%wfname) results=[] datasets = set(wfi.request['OutputDatasets']) comment="" if 'comment' in tasks: comment = ", reason: "+ tasks['comment'] wfi.sendLog('actor',"invalidating the workflow by traffic controller %s"%comment) #Reject all workflows in the family #first reject the original workflow. reqMgrClient.invalidateWorkflow(url, wfi.request['RequestName'], current_status=wfi.request['RequestStatus'], cascade=False) #Then reject any ACDCs associated with that workflow if 'ACDCs' in action_list[wfname]: children = action_list[wfname]['ACDCs'] for child in children: wfi.sendLog('actor',"rejecting %s"%child) wfi_acdc = workflowInfo(url, child) reqMgrClient.invalidateWorkflow(url, wfi_acdc.request['RequestName'], current_status=wfi_acdc.request['RequestStatus'], cascade=False) datasets.update( wfi_acdc.request['OutputDatasets'] ) #Invalidate all associated output datasets for dataset in datasets: results.append( setDatasetStatus(dataset, 'INVALID') ) if all(map(lambda result : result in ['None',None,True],results)): wfi.sendLog('actor',"%s and children are rejected"%wfname) cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except: sendLog('actor','Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'%wfname,level='critical') wfi.sendLog('actor','Failed to create clone for %s!'%wfname) remove_action(wfname) if not cloned: recover = False wfi.sendLog('actor','Failed to create clone for %s!'%wfname) sendLog('actor','Failed to create clone for %s!'%wfname,level='critical') else: wfi.sendLog('actor',"Workflow %s cloned"%wfname) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append({setting:allTasksDefaults[setting]}) print "Tasks is " print tasks all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog('actor','Cannot create ACDCS for %s because WMErr cannot be reached.'%wfname,level='critical') continue if not WMErr: sendLog('actor','Cannot create ACDCS for %s because WMErr is blank.'%wfname,level='critical') print "Moving on. WMErr is blank" continue try: where_to_run, missing_to_run,missing_to_run_at = wfi.getRecoveryInfo() print "Where to run = " print where_to_run except: sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical') print "Moving on. Cannot access recovery info for " + wfname continue if not where_to_run: sendLog('actor','Cannot create ACDCS for %s because site list cannot be found.'%wfname,level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for",wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog('actor','Cannot create ACDCS for %s because it is a pLHE workflow.'%wfname,level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task # print "Full task name is " + fulltaskname wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in ['Processing','Production','Merge']: wrong_task=True wfi.sendLog('actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"%( fulltaskname, task_info.taskType)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites=[SI.SE_to_CE(actions[action])] else: assign_to_sites=list(set([SI.SE_to_CE(site) for site in actions[action]])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]])) print "Found",sorted(assign_to_sites),"as sites where to run the ACDC at, from the acdc doc of ",wfname print "Going to run at",sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do = options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog('actor','ACDC created for task %s. Actions taken \n%s'%(fulltaskname,list(actions))) team = wfi.request['Teams'][0] parameters={ 'SiteWhitelist' : sorted(assign_to_sites), 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists']=='true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC",acdc parameters['execute']=True wfi.sendLog('actor',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC",acdc sendLog('actor',"%s needs to be assigned"%(acdc), level='critical') continue # print parameters result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) if not result: print acdc,"was not assigned" sendLog('actor',"%s needs to be assigned"%(acdc), level='critical') else: recovering.add( acdc ) wfi.sendLog('actor',"ACDCs created for %s"%wfname) #=========================================================== if recover and options.do: remove_action(wfname) if message_to_user: print wfname,"to be notified to user(DUMMY)",message_to_user if message_to_ops: print 'message' #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staging").all()) if specific: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered-tried").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staged").all()) dataset_endpoints = json.loads(open("%s/dataset_endpoints.json" % monitor_dir).read()) max_per_round = UC.get("max_per_round").get("assignor", None) max_cpuh_block = UC.get("max_cpuh_block") random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(","))): continue # if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog("assignor", "%s to be assigned" % wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: n_stalled += 1 no_go = True allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and "secondaries" in CI.campaigns[campaign]: allowed_secondary.update(CI.campaigns[campaign]["secondaries"]) if (secondary and allowed_secondary) and (set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog("assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary))) # sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog( "assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)), level="critical", ) if not options.go: n_stalled += 1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request["RequestStatus"] != "assignment-approved": if not options.test: wfh.sendLog("assignor", "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request["RequestStatus"] wfo.status = "away" session.commit() continue else: print wfo.name, wfh.request["RequestStatus"] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog("assignor", "cannot decide on version number") n_stalled += 1 wfo.status = "trouble" session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog("assignor", "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request["Campaign"], "SecondaryLocation", []) blocks = [] if "BlockWhitelist" in wfh.request: blocks = wfh.request["BlockWhitelist"] if "RunWhitelist" in wfh.request and wfh.request["RunWhitelist"]: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=wfh.request["RunWhitelist"]))) wfh.sendLog("assignor", "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "primary_AAA" in CI.campaigns[wfh.request["Campaign"]] ): primary_aaa = primary_aaa or CI.campaigns[wfh.request["Campaign"]]["primary_AAA"] secondary_aaa = options.secondary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "secondary_AAA" in CI.campaigns[wfh.request["Campaign"]] ): secondary_aaa = secondary_aaa or CI.campaigns[wfh.request["Campaign"]]["secondary_AAA"] for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") # pass if secondary_aaa: # just continue without checking continue presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.0] # one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only # sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog("assignor", "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = "/store/mc" ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) # sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] # sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.0] ] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog( "assignor", "Holding the data but not allowed %s" % sorted( list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])) ), ) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO # opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( (set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]) ) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog("assignor", "We could be running in addition at %s" % sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( "assignor", "One of the usable site is in downtime %s" % ([osite in SI.sites_not_ready for osite in opportunistic_sites]), ) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog("assignor", "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: # sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( "assignor", "%s requires a large numbr of CPUh %s , not assigning, please check with requester" % (wfo.name, cpuh), level="critical", ) wfh.sendLog("assignor", "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "maxcopies" in CI.campaigns[wfh.request["Campaign"]] ): copies_needed_from_campaign = CI.campaigns[wfh.request["Campaign"]]["maxcopies"] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog("assignor", "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]): not_even_once = not all([available >= 1.0 for available in available_fractions.values()]) wfh.sendLog( "assignor", "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values()), ) if down_time and not options.go and not options.early: wfo.status = "considered" session.commit() wfh.sendLog("assignor", "sending back to considered because of site downtime, instead of waiting") # sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( "assignor", "%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered." % (wfo.name), level="delay", ) continue # pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open("cannot_assign.json").read()) except: pass if ( not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial ): wfh.sendLog( "assignor", "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) known.append(wfo.name) open("cannot_assign.json", "w").write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == "considered": wfh.sendLog("assignor", "setting considered-tried") wfo.status = "considered-tried" session.commit() else: print "tried but status is", wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed", sorted(sites_allowed) if primary_aaa: sites_allowed = initial_sites_allowed options.TrustSitelists = True wfh.sendLog("assignor", "Selected to read primary through xrootd %s" % sorted(sites_allowed)) else: sites_allowed = sites_with_any_data wfh.sendLog("assignor", "Selected for any data %s" % sorted(sites_allowed)) if secondary_aaa: options.TrustPUSitelists = True wfh.sendLog("assignor", "Reading secondary through xrootd from %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints", sorted(sites_allowed) if not len(sites_allowed): wfh.sendLog("assignor", "cannot be assign with no matched sites") sendLog("assignor", "%s has no whitelist" % wfo.name, level="critical") n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith("T1")] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog("assignor", "Placing the output on %s" % sites_out) parameters = { "SiteWhitelist": sites_allowed, "NonCustodialSites": sites_out, "AutoApproveSubscriptionSites": list(set(sites_out)), "AcquisitionEra": wfh.acquisitionEra(), "ProcessingString": wfh.processingString(), "MergedLFNBase": set_lfn, "ProcessingVersion": version, } ## plain assignment here team = "production" if os.getenv("UNIFIED_TEAM"): team = os.getenv("UNIFIED_TEAM") if options and options.team: team = options.team if False and "T2_CH_CERN" in parameters["SiteWhitelist"]: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters["SiteWhitelist"] = ["T2_CH_CERN_HLT"] team = "hlt" ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and "," in v: parameters[key] = filter(None, v.split(",")) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog("assignor", "Setting the number of events per job to 500k max") parameters["EventsPerJob"] = 500000 ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request["Campaign"])) if not options.test: parameters["execute"] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if "EventBased" in split_check.values(): wfh.sendLog("assignor", "Falling back to event splitting.") # sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name, level="critical", ) elif "EventsPerJob" in split_check.values(): wfh.sendLog("assignor", "Modifying the number of job per event") # sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level="critical" ) # Handle run-dependent MC pstring = wfh.processingString() if "PU_RD" in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if "PU_RD2" in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: # sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog( "assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level="critical", ) wfh.sendLog("assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters["EventsPerJob"] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl["events_per_job"] if "events_per_job" in spl else None eventsPerJobEstimated = spl["avg_events_per_job"] if "avg_events_per_job" in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: # sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog( "assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level="critical" ) wfh.sendLog("assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters["LumisPerJob"] = lumisPerJob else: # sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( "assignor", "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level="critical", ) wfh.sendLog("assignor", "leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = "away" session.commit() n_assigned += 1 wfh.sendLog("assignor", "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list(sec) + new_wfi.request["OutputDatasets"]: ## lock all outputs flat NLI.lock(secure) # for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog("assignor", "Assigned %d Stalled %s" % (n_assigned, n_stalled))
unlock = True else: print "keep a lock on secondary within",delay_days,"days" unlock = False newly_locking.add(dataset) continue time_point("Checked as useful secondary", sub_lap=True) tier = dataset.split('/')[-1] creators = getWorkflowByOutput( url, dataset , details=True) if not creators and not tier == 'RAW' and not '-PromptReco-' in dataset: ds_status = getDatasetStatus( dataset ) if not '-v0/' in dataset and ds_status!=None: #sendEmail('failing get by output','%s has not been produced by anything?'%dataset) sendLog('lockor','failing get by output, %s has not been produced by anything?'%dataset, level='critical') newly_locking.add(dataset) continue else: # does not matter, cannot be an OK dataset unlock = True bad_ds = True creators_status = [r['RequestStatus'] for r in creators] print "Statuses of workflow that made the dataset",dataset,"are",creators_status if len(creators_status) and all([status in ['failed','aborted','rejected','aborted-archived','rejected-archived'] for status in creators_status]): ## crap print "\tunlocking",dataset,"for bad workflow statuses" unlock = True bad_ds = True time_point("Check as necessary output", sub_lap=True)
# #os.system('Unified/rejector.py --clone --to_step %s --comments "Transforming into StepChain for efficiency"'%( wfo.name)) # #os.system('Unified/injector.py %s'% wfi.request['PrepID']) # pass ### all dqmharvest completed to announced right away ### wfs = getWorkflows(url, 'completed', user=None, rtype='DQMHarvest') for wf in wfs: print "closing out",wf reqMgrClient.closeOutWorkflow(url, wf) wfs = getWorkflows(url, 'closed-out', user=None, rtype='DQMHarvest') for wf in wfs: print "announcing",wf reqMgrClient.announceWorkflow(url, wf) wfs = getWorkflows(url, 'failed', user=None, rtype='DQMHarvest') if len(wfs): sendLog('addHoc','There are failed Harvesting requests\n%s'%('\n'.join(sorted( wfs))),level='critical') ### clone all DR80 that end up with issues ### ## #for wfo in session.query(Workflow).filter(Workflow.status == 'assistance-manual').all(): # if not any([c in wfo.name for c in ['RunIISpring16DR80']]): continue # wfi = workflowInfo(url, wfo.name) # if wfi.getRequestNumEvents() < 500000: # ## small workflow that needs recovery : kill-clone # os.system('Unified/rejector.py --clone %s'%wfo.name) """ ### add the value of the delay to announcing datasets data = json.loads(open('%s/announce_delays.json'%monitor_dir).read())
def invalidator(url, invalid_status='INVALID'): use_mcm = True up = componentInfo(soft=['wtc','jira']) if not up.check(): return mcm = McMClient(dev=False) invalids = mcm.getA('invalidations',query='status=announced') if not invalids: return print len(invalids),"Object to be invalidated" text_to_batch = defaultdict(str) text_to_request = defaultdict(str) for invalid in invalids: acknowledge= False pid = invalid['prepid'] batch_lookup = invalid['prepid'] text = "" if invalid['type'] == 'request': wfn = invalid['object'] print "need to invalidate the workflow",wfn wfo = session.query(Workflow).filter(Workflow.name == wfn).first() if wfo: ## set forget of that thing (although checkor will recover from it) print "setting the status of",wfo.status,"to forget" wfo.status = 'forget' session.commit() else: ## do not go on like this, do not acknoledge it print wfn,"is set to be rejected, but we do not know about it yet" #continue wfi = workflowInfo(url, wfn) success = "not rejected" ## to do, we should find a way to reject the workflow and any related acdc successes = invalidate(url, wfi, only_resub=True, with_output=False) wfi.sendLog('invalidator',"rejection is performed from McM invalidations request") acknowledge= all(successes) text = "The workflow %s (%s) was rejected due to invalidation in McM" % ( wfn, pid ) batch_lookup = wfn ##so that the batch id is taken as the one containing the workflow name elif invalid['type'] == 'dataset': dataset = invalid['object'] if '?' in dataset: continue if 'None' in dataset: continue if 'None-' in dataset: continue if 'FAKE-' in dataset: continue print "setting",dataset,"to",invalid_status success = setDatasetStatus(dataset , invalid_status ) if success: acknowledge= True text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % ( dataset, pid ) else: msg = "Could not invalidate {}. Please consider contacting data management team for manual intervention.".format(dataset) print(msg) sendLog('invalidator', msg, level='critical') else: print "\t\t",invalid['type']," type not recognized" if acknowledge: ## acknoldge invalidation in mcm, provided we can have the api print "acknowledgment to mcm" ackno_url = '/restapi/invalidations/acknowledge/%s'%( invalid['_id'] ) print "at",ackno_url mcm.get(ackno_url) # prepare the text for batches batches = [] batches.extend(mcm.getA('batches',query='contains=%s'%batch_lookup)) batches = filter(lambda b : b['status'] in ['announced','done','reset'], batches) if len(batches): bid = batches[-1]['prepid'] print "batch nofication to",bid text_to_batch[bid] += text+"\n\n" # prepare the text for requests text_to_request[pid] += text+"\n\n" for bid,text in text_to_batch.items(): if not text: continue text += '\n This is an automated message' mcm.put('/restapi/batches/notify',{ "notes" : text, "prepid" : bid}) pass for pid,text in text_to_request.items(): if not text: continue text += '\n This is an automated message' mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid]})
def singleClone(url, wfname, actions, comment, do=False): wfi = workflowInfo(url, wfname) payload = wfi.getSchema() initial = wfi.request payload['Requestor'] = os.getenv('USER') payload['Group'] = 'DATAOPS' payload['OriginalRequestName'] = initial['RequestName'] payload['RequestPriority'] = initial['RequestPriority'] if 'ProcessingVersion' in initial: payload['ProcessingVersion'] = int(initial['ProcessingVersion']) +1 else: payload['ProcessingVersion'] = 2 ## drop parameters on the way to reqmgr2 paramBlacklist = ['BlockCloseMaxEvents', 'BlockCloseMaxFiles', 'BlockCloseMaxSize', 'BlockCloseMaxWaitTime', 'CouchWorkloadDBName', 'CustodialGroup', 'CustodialSubType', 'Dashboard', 'GracePeriod', 'HardTimeout', 'InitialPriority', 'inputMode', 'MaxMergeEvents', 'MaxMergeSize', 'MaxRSS', 'MaxVSize', 'MinMergeSize', 'NonCustodialGroup', 'NonCustodialSubType', 'OutputDatasets', 'ReqMgr2Only', 'RequestDate' 'RequestorDN', 'RequestName', 'RequestStatus', 'RequestTransition', 'RequestWorkflow', 'SiteWhitelist', 'SoftTimeout', 'SoftwareVersions', 'SubscriptionPriority', 'Team', 'timeStamp', 'TrustSitelists', 'TrustPUSitelists', 'TotalEstimatedJobs', 'TotalInputEvents', 'TotalInputLumis', 'TotalInputFiles','checkbox', 'DN', 'AutoApproveSubscriptionSites', 'NonCustodialSites', 'CustodialSites', 'OriginalRequestName', 'Teams', 'OutputModulesLFNBases', 'SiteBlacklist', 'AllowOpportunistic', '_id'] for p in paramBlacklist: if p in payload: payload.pop( p ) pass if actions: for action in actions: if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same': if 'TaskChain' in payload: print "Setting memory for clone of task chain" it=1 while True: t = 'Task%d'%it it+=1 if t in payload: payload[t]['Memory'] = actions[action] print "Memory set for Task%d"%it else: break else: print "Setting memory for non-taskchain workflow" payload['Memory'] = actions[action] print "Memory set to " + actions[action] #This line is doesn't work for some reason # wfi.sendLog('actor','Memory of clone set to %d'%actions[action]) print "Clone payload" # print json.dumps( payload , indent=2) print actions #Create clone clone = reqMgrClient.submitWorkflow(url, payload) if not clone: print "Error in making clone for",initial["RequestName"] clone = reqMgrClient.submitWorkflow(url, payload) if not clone: print "Error twice in making clone for",initial["RequestName"] sendLog('actor','Failed to make a clone twice for %s!'%initial["RequestName"],level='critical') wfi.sendLog('actor','Failed to make a clone twice for %s!'%initial["RequestName"]) return None if actions: for action in actions: if action.startswith('split'): cloneinfo = workflowInfo(url, clone) splittings = cloneinfo.getSplittings() if actions[action] != 'Same' and actions[action] != 'max' and actions[action] != '': factor = int(actions[action][0:-1]) if 'x' in actions[action] else 2 for split in splittings: for act in ['avg_events_per_job','events_per_job','lumis_per_job']: if act in split: wfi.sendLog('actor','Changing %s (%d) by a factor %d'%( act, split[act], factor)) print "Changing %s (%d) by a factor %d"%( act, split[act], factor), split[act] /= factor print "to",split[act] break split['requestName'] = clone print "changing the splitting of",clone print json.dumps( split, indent=2 ) print reqMgrClient.setWorkflowSplitting(url, clone, split ) elif 'max' in actions[action]: for split in splittings: for act in ['avg_events_per_job','events_per_job','lumis_per_job']: if act in split: wfi.sendLog('actor','Max splitting set for %s (%d'%( act, split[act])) print "Changing %s (%d) "%( act, split[act]), split[act] = 1 print "to max splitting ",split[act] break split['requestName'] = clone print "changing the splitting of",clone print json.dumps( split, indent=2 ) print reqMgrClient.setWorkflowSplitting(url, clone, split ) #Approve data = reqMgrClient.setWorkflowApproved(url, clone) wfi.sendLog('actor','Cloned into %s'%clone) # wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment )) # wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False) print data return clone
task_path_name_limit = UC.get("full_task_path_name_limit") for wf in wfs: if spec and not spec in wf['RequestName']: continue wfi = workflowInfo(url, wf['RequestName'], request=wf) splits = wfi.getSplittingsNew(all_tasks=True) if any([len(s['taskName']) > task_path_name_limit for s in splits]): msg = "The full task path name exceeds the limit of {} characters. The requestor should either shorten task names or have fewer tasks in the chain.".format( task_path_name_limit) wfi.sendLog("GQ", msg) msg += '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfi.getPrepIDs()[0]) sendLog("GQ", msg, level='critical') continue sitewhitelist = wfi.request['SiteWhitelist'] wqs = wfi.getWorkQueue() stats = wfi.getWMStats() if not 'AgentJobInfo' in stats: stats['AgentJobInfo'] = {} ## skip wf that unified does not know about, leaves acdc wfo = session.query(Workflow).filter( Workflow.name == wf['RequestName']).first() if not (wfo or wf['RequestType'] == 'Resubmission'): print "not knonw or not acdc : %s" % (wf['RequestName']) continue
def singleRecovery(url, task, initial, actions, do=False): print "Inside single recovery!" payload = { "Requestor" : os.getenv('USER'), "Group" : 'DATAOPS', "RequestType" : "Resubmission", "ACDCServer" : initial['CouchURL'], "ACDCDatabase" : "acdcserver", "OriginalRequestName" : initial['RequestName'], "OpenRunningTimeout" : 0 } copy_over = ['PrepID','Campaign','RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group', 'Memory', 'RequestString' ,'CMSSWVersion'] for c in copy_over: if c in initial: payload[c] = copy.deepcopy(initial[c]) else: print c,"not in the initial payload" #a massage ? boost the recovery over the initial wf # payload['RequestPriority'] *= 10 #Max priority is 1M payload['RequestPriority'] = min(500000, payload['RequestPriority']*2 ) ## never above 500k #change parameters based on actions here if actions: for action in actions: if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same': payload['Memory'] = actions[action] print "Memory set to " + actions[action] ## Taskchains needs to be treated special to set the memory to all tasks if 'TaskChain' in initial: it = 1 while True: t = 'Task%d'%it it += 1 if t in initial: payload[t] = copy.deepcopy(initial[t]) payload[t]['Memory'] = actions[action] else: break if action.startswith('split'): split_alert = (initial['RequestType'] in ['MonteCarlo'] ) for key in initial: if key == 'SplittingAlgo' and (initial[key] in ['EventBased']): split_alert = True elif key.startswith('Task') and key != 'TaskChain': for key2 in initial[key]: if key2 == 'TaskName': print "task",task.split('/')[-1] print "TaskName",initial[key][key2] if (initial[key][key2] == task) and (initial[key][key2] in ['EventBased']): split_alert = True if split_alert: sendLog('actor','Cannot change splitting for %s'%initial['RequestName'],level='warning') print "I should not be doing splitting for this type of request",initial['RequestName'] return None acdc_round = 0 initial_string = payload['RequestString'] if initial_string.startswith('ACDC'): if initial_string[4].isdigit(): acdc_round = int(initial_string[4]) acdc_round += 1 initial_string = initial_string.replace('ACDC_','').replace('ACDC%d_'%(acdc_round-1),'') payload['RequestString'] = 'ACDC%d_%s'%(acdc_round,initial_string) payload['InitialTaskPath'] = task if not do: print json.dumps( payload, indent=2) return None print "ACDC payload" # print json.dumps( payload , indent=2) print actions ## submit here acdc = reqMgrClient.submitWorkflow(url, payload) if not acdc: print "Error in making ACDC for",initial["RequestName"] acdc = reqMgrClient.submitWorkflow(url, payload) if not acdc: print "Error twice in making ACDC for",initial["RequestName"] sendLog('actor','Failed twice in making ACDCs for %s!'%initial['RequestName'],level='critical') return None ## change splitting if requested if actions: for action in actions: if action.startswith('split'): acdcInfo = workflowInfo(url, acdc) splittings = acdcInfo.getSplittings() if actions[action] != 'Same' and actions[action] != 'max': factor = int(actions[action][0:-1]) if 'x' in actions[action] else 2 for split in splittings: for act in ['avg_events_per_job','events_per_job','lumis_per_job']: if act in split: print "Changing %s (%d) by a factor %d"%( act, split[act], factor), split[act] /= factor print "to",split[act] break split['requestName'] = acdc print "changing the splitting of",acdc print json.dumps( split, indent=2 ) print reqMgrClient.setWorkflowSplitting(url, acdc, split ) elif 'max' in actions[action]: for split in splittings: for act in ['avg_events_per_job','events_per_job','lumis_per_job']: if act in split: print "Changing %s (%d) "%( act, split[act]), split[act] = 1 print "to max splitting ",split[act] break split['requestName'] = acdc print "changing the splitting of",acdc print json.dumps( split, indent=2 ) print reqMgrClient.setWorkflowSplitting(url, acdc, split ) data = reqMgrClient.setWorkflowApproved(url, acdc) print data return acdc
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos=[] if specific or options.early: wfos.extend( session.query(Workflow).filter(Workflow.status=='considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) if specific: wfos.extend( session.query(Workflow).filter(Workflow.status=='considered-tried').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) wfh.sendLog('assignor',"%s to be assigned"%wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False allowed_secondary = set() for campaign in wfh.getCampaigns(): if not CI.go( campaign ): wfh.sendLog('assignor',"No go for %s"%campaign) if not options.go: n_stalled+=1 no_go = True break if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)), level='critical') if not options.go: n_stalled+=1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite in SI.sites_not_ready for osite in opportunistic_sites])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial: wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed",sorted(sites_allowed) if options.primary_aaa: sites_allowed = initial_sites_allowed #options.useSiteListAsLocation = True options.TrustSitelists = True else: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints",sorted(sites_allowed) #if options.partial: # continue if not len(sites_allowed): wfh.sendLog('assignor',"cannot be assign with no matched sites") #sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name)) sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team ## high priority team agent #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): # ## consider SDSC # parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] # parameters['useSiteListAsLocation'] = True # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0: # parameters['SiteWhitelist'] = ['T3_US_SDSC'] # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting'%wfo.name, level='critical') elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock( secure ) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def injector(url, options, specific): mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm','wtc','jira'] ) if not up.check(): return use_mcm = up.status['mcm'] UC = unifiedConfiguration() transform_keywords = UC.get('convert_to_stepchain') workflows = getWorkflows(url, status=options.wmstatus, user=options.user) for user in UC.get("user_rereco"): workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco")) for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")) : workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain")) for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")) : workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults")) print len(workflows),"in line" cannot_inject = set() to_convert = set() status_cache = defaultdict(str) ## browse for assignment-approved requests, browsed for ours, insert the diff for wf in workflows: if specific and not specific in wf: continue exists = session.query(Workflow).filter(Workflow.name == wf ).first() if not exists: wfi = workflowInfo(url, wf) ## check first that there isn't related here with something valid can_add = True ## first try at finding a match familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all() if not familly: pids = wfi.getPrepIDs() req_familly = [] for pid in pids: req_familly.extend( getWorkflowById( url, pid, details=True) ) familly = [] print len(req_familly),"members" for req_member in req_familly: #print "member",req_member['RequestName'] owfi = workflowInfo(url, req_member['RequestName'], request=req_member) other_pids = owfi.getPrepIDs() if set(pids) == set(other_pids): ## this is a real match familly.extend( session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all() ) for lwfo in familly: if lwfo: ## we have it already if not lwfo.status in ['forget','trouble','forget-unlock','forget-out-unlock']: wfi.sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status )) sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status ), level='critical') print "Should not put",wf,"because of",lwfo.name,lwfo.status cannot_inject.add( wf ) can_add = False ## add a check on validity of input datasets _,prim,par,sec = wfi.getIO() for d in list(prim)+list(par)+list(sec): if not d in status_cache: status_cache[d] = getDatasetStatus(d) if status_cache[d] != 'VALID': wfi.sendLog('injector',"One of the input is not VALID. %s : %s"%( d, status_cache[d])) sendLog('injector',"One of the input of %s is not VALID. %s : %s"%( wf, d, status_cache[d]), level='critical') can_add = False #else: # ##make sure that all blocks get closed # closeAllBlocks(url, d) ## check for any file in phedex, to verify existence _,ph_files,_,_ = getDatasetFiles(url, d) if not ph_files and not ( 'StoreResults' == wfi.request.setdefault('RequestType',None) ): wfi.sendLog('injector',"One of the input has no file in phedex: %s" % d ) sendLog('injector',"One of the input has no file in phedex: %s"% d, level='critical') can_add = False ### ban some workflow that you don't like anymore #outputs = wfi.request['OutputDatasets'] if not can_add: continue ## temporary hack to transform specific taskchain into stepchains good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords) #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = None) ## match keywords and technical constraints if (not options.no_convert) and good_for_stepchain and not wfi.isRelval(): to_convert.add( wf ) wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf) sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf) wfi.sendLog('injector',"considering %s"%wf) new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) session.add( new_wf ) session.commit() time.sleep(0.5) else: #print "already have",wf pass if cannot_inject: #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject))) sendLog('injector','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)), level='critical') for wf in to_convert: os.system('./Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s'% wf) ## passing a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) no_replacement = set() #print "getting all transfers" #all_transfers=session.query(Transfer).all() #print "go!" ## pick up replacements for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all(): print wf.name if specific and not specific in wf.name: continue print wf.name wfi = workflowInfo(url, wf.name ) wl = wfi.request #getWorkLoad(url, wf.name) familly = getWorkflowById( url, wl['PrepID'] ) true_familly = [] for member in familly: if member == wf.name: continue fwl = getWorkLoad(url , member) if options.replace: if member != options.replace: continue else: if fwl['RequestDate'] < wl['RequestDate']: continue if fwl['RequestType']=='Resubmission': continue if fwl['RequestStatus'] in ['None',None,'new']: continue if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue true_familly.append( fwl ) if len(true_familly)==0: #sendLog('injector','%s had no replacement'%wf.name, level='critical') if wfi.isRelval(): #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.') wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget') wf.status = 'forget' session.commit() else: wfi.sendLog('injector','the workflow was found in trouble with no replacement') no_replacement.add( wf.name ) continue else: wfi.sendLog('injector','the workflow was found in trouble and has a replacement') print wf.name,"has",len(familly),"familly members" print wf.name,"has",len(true_familly),"true familly members" ##we cannot have more than one of them !!! pick the last one if len(true_familly)>1: #sendEmail('multiple wf','please take a look at injector for %s'%wf.name) sendLog('injector','Multiple wf in line, will take the last one for %s \n%s'%( wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical') for fwl in true_familly[-1:]: member = fwl['RequestName'] new_wf = session.query(Workflow).filter(Workflow.name == member).first() if not new_wf: sendLog('injector',"putting %s as replacement of %s"%( member, wf.name)) status = 'away' if fwl['RequestStatus'] in ['assignment-approved']: status = 'considered' new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus']) wf.status = 'forget' session.add( new_wf ) else: if new_wf.status == 'forget': continue sendLog('injector',"getting %s as replacement of %s"%( new_wf.name, wf.name )) wf.status = 'forget' for tr in session.query(TransferImp).filter( TransferImp.workflow_id == wf.id).all(): ## get all transfer working for the old workflow existing = session.query(TransferImp).filter( TransferImp.phedexid == tr.phedexid).filter( TransferImp.workflow_id == new_wf.id).all() tr.active = False ## disable the old one if not existing: ## create the transfer object for the new dependency tri = TransferImp( phedexid = tr.phedexid, workflow = new_wf) session.add( tri ) session.commit() ## don't do that automatically #wf.status = 'forget' session.commit() if no_replacement: #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement))) sendLog('injector','workflow with no replacement\n%s \n are dangling there'% ( '\n'.join(no_replacement)), level='critical')
def recoveror(url,specific,options=None): if userLock('recoveror'): return up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() def make_int_keys( d ): for code in d: d[int(code)] = d.pop(code) error_codes_to_recover = UC.get('error_codes_to_recover') error_codes_to_block = UC.get('error_codes_to_block') error_codes_to_notify = UC.get('error_codes_to_notify') make_int_keys( error_codes_to_recover ) make_int_keys( error_codes_to_block ) make_int_keys( error_codes_to_notify ) #wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all() wfs = session.query(Workflow).filter(Workflow.status.contains('recovery')).all() if specific: wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() ) for wfo in wfs: if specific and not specific in wfo.name:continue if not specific and 'manual' in wfo.status: continue wfi = workflowInfo(url, wfo.name) ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves all_errors = {} try: wfi.getSummary() all_errors = wfi.summary['errors'] except: pass print '-'*100 print "Looking at",wfo.name,"for recovery options" recover = True if not 'MergedLFNBase' in wfi.request: print "f****d up" sendEmail('missing lfn','%s wl cache is screwed up'%wfo.name) recover = False if not len(all_errors): print "\tno error for",wfo.name recover = False task_to_recover = defaultdict(list) message_to_ops = "" message_to_user = "" if 'LheInputFilese' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE recover = False if wfi.request['RequestType'] in ['MonteCarlo','ReReco']: recover = False if 'Campaign' in wfi.request: c = wfi.request['Campaign'] if c in CI.campaigns and 'recover' in CI.campaigns[c]: recover=CI.campaigns[c]['recover'] for task,errors in all_errors.items(): print "\tTask",task ## collect all error codes and #jobs regardless of step at which it occured all_codes = [] for name, codes in errors.items(): if type(codes)==int: continue all_codes.extend( [(int(code),info['jobs'],name,list(set([e['type'] for e in info['errors']])),list(set([e['details'] for e in info['errors']])) ) for code,info in codes.items()] ) all_codes.sort(key=lambda i:i[1], reverse=True) sum_failed = sum([l[1] for l in all_codes]) for errorCode,njobs,name,types,details in all_codes: rate = 100*njobs/float(sum_failed) #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name) print ("\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, ','.join(types), name) added_in_recover=False #if options.go: # force the recovery of any task with error ? if errorCode in error_codes_to_recover: ## the error code is registered for case in error_codes_to_recover[errorCode]: match = case['details'] matched= (match==None) if not matched: matched=False for detail in details: if match in detail: print "[recover] Could find keyword",match,"in" print 50*"#" print detail print 50*"#" matched = True break if matched and rate > case['rate']: print "\t\t => we should be able to recover that", case['legend'] task_to_recover[task].append( (code,case) ) added_in_recover=True message_to_user = "" else: print "\t\t recoverable but not frequent enough, needs",case['rate'] if errorCode in error_codes_to_block: for case in error_codes_to_block[errorCode]: match = case['details'] matched= (match==None) if not matched: matched=False for detail in details: if match in detail: print "[block] Could find keyword",match,"in" print 50*"#" print detail print 50*"#" matched = True break if matched and rate > case['rate']: print "\t\t => that error means no ACDC on that workflow", case['legend'] if not options.go: message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n "%( wfo.name, errorCode, '#'*50 ) recover = False added_in_recover=False if errorCode in error_codes_to_notify and not added_in_recover: print "\t\t => we should notify people on this" message_to_user += "%s has an error %s in processing.\n%s\n" %( wfo.name, errorCode, '#'*50 ) if message_to_user: print wfo.name,"to be notified to user(DUMMY)",message_to_user if message_to_ops: sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) if len(task_to_recover) != len(all_errors): print "Should not be doing partial ACDC. skipping" #sendEmail('recoveror','do not want to make partial acdc on %s'%wfo.name) sendLog('recoveror','do not want to make partial acdc on %s'%wfo.name, level='warning') recover = False if task_to_recover and recover: print "Initiating recovery" print ', '.join(task_to_recover.keys()),"to be recovered" recovering=set() for task in task_to_recover: print "Will be making a recovery workflow for",task ## from here you can fetch known solutions, to known error codes actions = list(set([case['solution'] for code,case in task_to_recover[task] ])) acdc = singleRecovery(url, task, wfi.request , actions, do = options.do) if not acdc: if options.do: if recovering: print wfo.name,"has been partially ACDCed. Needs manual attention" #sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**']) sendLog('recoveror', "%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), level='critical') continue else: print wfo.name,"failed recovery once" #break continue else: print "no action to take further" #sendEmail("an ACDC that can be done automatically","please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details", destination=['*****@*****.**']) sendLog('recoveror', "ACDC for %s can be done automatically"% wfo.name, level='critical') continue ## and assign it ? team = wfi.request['Teams'][0] parameters={ #'SiteWhitelist' : wfi.request['SiteWhitelist'], 'SiteWhitelist' : SI.sites_ready, 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None if options.ass: print "really doing the assignment of the ACDC",acdc parameters['execute']=True #sendEmail("an ACDC was done and WAS assigned", "%s was assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**']) wfi.sendLog('recoveror',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC",acdc #sendEmail("an ACDC was done and need to be assigned", "%s needs to be assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**']) sendLog('recoveror',"%s needs to be assigned"%(acdc), level='critical') result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) if not result: print acdc,"was not asigned" #sendEmail("an ACDC was done and need to be assigned","%s needs to be assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**']) sendLog('recoveror',"%s needs to be assigned"%(acdc), level='critical') else: recovering.add( acdc ) current = None if recovering: #if all went well, set the status to -recovering current = wfo.status if options.ass: current = current.replace('recovery','recovering') else: current = 'assistance-manual' print 'created ACDC: '+', '.join( recovering ) else: ## was set to be recovered, and no acdc was made current = 'assistance-manual' if current: print wfo.name,"setting the status to",current wfo.status = current session.commit() else: ## this workflow should be handled manually at that point print wfo.name,"needs manual intervention" wfo.status = 'assistance-manual' session.commit()
def transferor(url, specific=None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() #NLI = newLockInfo() #if not NLI.free(): return LI = lockInfo() if not LI.free(): return mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).filter( ~Workflow.status.contains('custodial')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] max_per_round = UC.get('max_per_round').get('transferor', None) print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) all_to_include = session.query(Workflow).filter( Workflow.status.startswith('considered')).all() if len(cache) > 2000: max_to_include = max_per_round random.shuffle(cache) ## randomize first by wf name cache = sorted(cache, key=lambda r: r['RequestPriority'], reverse=True) ## order by prio highest = [r['RequestName'] for r in cache[:max_to_include]] all_to_include = [wfo for wfo in all_to_include if wfo.name in highest] print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len( all_to_include) for wfo in all_to_include: print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = defaultdict(float) ignored_input_sizes = defaultdict(float) input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." #stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read()) stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print wfo.name, "staging" (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() blocks = wfh.getBlocks() for prim in primary: ds_s = dss.get(prim, blocks=blocks) if prim in stucks: wfh.sendLog( 'transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s)) ignored_input_sizes[prim] = max(ds_s, ignored_input_sizes[prim]) else: input_sizes[prim] = max(ds_s, input_sizes[prim]) wfh.sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s)) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." input_blocks = {} for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() blocks = wfh.getBlocks() input_blocks[wfo.name] = blocks for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim, blocks=blocks) input_sizes[prim] = max(prim_size, input_sizes[prim]) primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendLog( "transferor", "No request in staging, using first request to set priority limit") if len(wfs_and_wfh): min_transfer_priority = wfs_and_wfh[0][1].request[ 'RequestPriority'] in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority'] else: return cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = defaultdict(float) went_over_budget = False destination_cache = {} no_goes = set() if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: if wfh.isRelval(): wfo.status = 'forget' else: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() blocks = input_blocks.get(wfo.name, wfh.getBlocks()) if blocks: print "Reading only", len(blocks), "blocks in input" this_load = sum([dss.get(prim, blocks=blocks) for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = {} overide_parameters = {} check_secondary = (not wfh.isRelval()) output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: overide_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'transferor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('transferor', 'These data tiers %s are not allowed in %s' % (','.join(banned_tier), wfo.name), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('transferor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('transferor', critical_msg, level='critical') if not options.go: no_go = True for sec in secondary: if sec in allowed_secondary: overide_parameters.update(allowed_secondary[sec]) if 'SiteWhitelist' in overide_parameters: sites_allowed = list( set(sites_allowed) & set(overide_parameters['SiteWhitelist'])) wfh.sendLog( 'transferor', 'Intersecting with the overriding whitelist parameters, allowed sites become {}' .format(sites_allowed)) if no_go: continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue # break ## try this for a while to make things faster ## the site white list considers site, campaign, memory and core information if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): LI.lock(dataset, reason='staging') if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) if blocks: print "limiting to blocks", "\n".join(sorted(blocks)) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be #prim_destination = [site for site in destinations.keys() if not site in prim_location] prim_destination = [ site for (site, info) in destinations.items() if info['data_fraction'] == 1 and info['completion'] != 100 ] ## veto the site with no current disk space, for things that are not relval prim_destination = [ site for site in prim_destination if (SI.disk[site] or wfh.isRelval()) ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "Counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers potential_destinations = len(prim_to_distribute) #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## do we want to restrict transfers if the amount of site in vetoe are too large ? wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: existings = session.query(TransferImp).filter( TransferImp.phedexid == int(latching)).filter( TransferImp.workflow_id == wfo.id).all() if not existings: tri = TransferImp(phedexid=int(latching), workflow=wfo) print "adding", wfo.id, "with phedexid", latching session.add(tri) else: for existing in existings: existing.active = True session.flush() can_go = False transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The input is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim)) sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim), level='critical') print json.dumps(prim_to_distribute, indent=2) print json.dumps(prim_location, indent=2) print json.dumps(prim_destination, indent=2) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] print "changed to" print json.dumps(prim_to_distribute, indent=2) if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) ## prune the blocks/destination that are already in the making, so that subscription don't overlap for site in spreading: for block in list(spreading[site]): if site in destinations and block in destinations[ site]['blocks'].keys(): ## prune it spreading[site].remove(block) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) else: can_go = False allowed = False if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] if 'SecondaryLocation' in overide_parameters: override_sec_destination = overide_parameters[ 'SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = set( [SI.CE_to_SE(site) for site in sites_allowed]) destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if k in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] print sec, json.dumps(destinations, indent=2) sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] ## this is in SE else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] ## how to make unified understand that it has to wait for the secondary if the sec_destination and #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in sec_location ] #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [ site for site in sec_to_distribute if not SI.CE_to_SE(site) in sec_destination ] presitespace_sec_to_distribute = copy.deepcopy( sec_to_distribute) #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] #sec_to_distribute = [site for site in sec_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] sec_to_distribute = [ site for site in sec_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## at this point you have a problem if len(sec_to_distribute) == 0 and len( presitespace_sec_to_distribute): sendLog( 'transferor', '%s is getting no possible destinations because of lack of space. To be decided what to do in general' % (sec), level='critical') if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) > sec_size or wfh.isRelval(): wfh.sendLog('transferor', 'Sending %s to %s' % (sec, site)) all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog( 'transferor', '%s is too big (%s) for %s (%s). %s will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024, wfo.name), level='critical') wfh.sendLog( 'transferor', '%s is too big (%s) for %s (%s). will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024)) else: ## this is bas overall print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog( 'transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(sorted(no_goes)), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets #if site in SI.sites_veto_transfer: # print site,"does not want transfers" # continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue transfered_items = defaultdict(set) if execute: priority = 'normal' cds = [ ds for ds in set(datasets + block_datasets) if ds in max_priority ] ## bucketize the transfers by priority of workflows prioritized_items = defaultdict(set) for item in items_to_transfer: d = item.split('#')[0] p = max_priority.get(d, 80000) q = 'normal' if p > 100000: q = 'reserved' elif p < 70000: q = 'low' prioritized_items[q].add(item) for priority, items in prioritized_items.items(): result = makeReplicaRequest(url, site_se, list(items), 'prestaging', priority=priority, approve=True) if result: these_transfers = [ o['id'] for o in result['phedex']['request_created'] ] #phedexids.extend( these_transfers ) for ph in these_transfers: transfered_items[ph].update(items) else: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items, site_se), level='critical') #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True) #phedexids = [o['id'] for o in result['phedex']['request_created']]: #else: # #result= {'phedex':{'request_created' : []}} # phedexids = [] # fake_id-=1 if not transfered_items: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items_to_transfer, site), level='critical') continue for phedexid, items in transfered_items.items(): print phedexid, "transfer created" for transfering in list( set(map(lambda it: it.split('#')[0], items))): for wfid in workflow_dependencies[transfering]: new_transfer = session.query(TransferImp).filter( TransferImp.phedexid == int(phedexid)).filter( TransferImp.workflow_id == wfid).first() if not new_transfer: new_transfer = TransferImp( phedexid=phedexid, workflow=session.query(Workflow).get(wfid)) session.add(new_transfer) else: new_transfer.active = True wf_id_in_prestaging.add(wfid) #session.commit() for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" #session.commit() ## one big session commit at the end that everything went fine session.commit()
#session.commit() wfs = getWorkflows(url, 'assigned', details=True) now = time.mktime(time.gmtime()) for wf in wfs: assigned_log = filter(lambda change: change["Status"] in ["assigned"], wf['RequestTransition']) if assigned_log: then = assigned_log[-1]['UpdateTime'] since = (now - then) / float(1 * 24 * 60 * 60.) if since > 1.: print "workflow", wf[ 'RequestName'], "is assigned since", then, " that is", since, "days" sendLog('GQ', 'The workflow %s has been assigned for %.2f days' % (wf['RequestName'], since), level='critical') overall_timeout = 14 #days may_have_one = set() may_have_one.update([ wfo.name for wfo in session.query(Workflow).filter( Workflow.status.startswith('away')).all() ]) may_have_one.update([ wfo.name for wfo in session.query(Workflow).filter( Workflow.status.startswith('assistance')).all() ]) wfs = [] wfs.extend(getWorkflows(url, 'running-open', details=True))
def recoveror(url, specific, options=None): if userLock('recoveror'): return up = componentInfo(soft=['mcm', 'wtc']) if not up.check(): return CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() use_recoveror = UC.get('use_recoveror') if not use_recoveror and not options.go: print "We are told not to run recoveror" return def make_int_keys(d): for code in d: d[int(code)] = d.pop(code) error_codes_to_recover = UC.get('error_codes_to_recover') error_codes_to_block = UC.get('error_codes_to_block') error_codes_to_notify = UC.get('error_codes_to_notify') make_int_keys(error_codes_to_recover) make_int_keys(error_codes_to_block) make_int_keys(error_codes_to_notify) #wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all() wfs = session.query(Workflow).filter( Workflow.status.contains('recovery')).all() if specific: wfs.extend( session.query(Workflow).filter( Workflow.status == 'assistance-manual').all()) for wfo in wfs: if specific and not specific in wfo.name: continue if not specific and 'manual' in wfo.status: continue wfi = workflowInfo(url, wfo.name) ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves all_errors = {} try: ## this is clearly very truncated and should be changed completely wfi.getSummary() all_errors = wfi.summary['errors'] except: pass print '-' * 100 print "Looking at", wfo.name, "for recovery options" recover = True if not 'MergedLFNBase' in wfi.request: print "f****d up" sendEmail('missing lfn', '%s wl cache is screwed up' % wfo.name) recover = False if not len(all_errors): print "\tno error for", wfo.name recover = False task_to_recover = defaultdict(list) message_to_ops = "" message_to_user = "" if 'LheInputFilese' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE recover = False if wfi.request['RequestType'] in ['MonteCarlo', 'ReReco']: recover = False if 'Campaign' in wfi.request: c = wfi.request['Campaign'] if c in CI.campaigns and 'recover' in CI.campaigns[c]: recover = CI.campaigns[c]['recover'] for task, errors in all_errors.items(): print "\tTask", task ## collect all error codes and #jobs regardless of step at which it occured all_codes = [] for name, codes in errors.items(): if type(codes) == int: continue all_codes.extend([ (int(code), info['jobs'], name, list(set([e['type'] for e in info['errors']])), list(set([e['details'] for e in info['errors']]))) for code, info in codes.items() ]) all_codes.sort(key=lambda i: i[1], reverse=True) sum_failed = sum([l[1] for l in all_codes]) for errorCode, njobs, name, types, details in all_codes: rate = 100 * njobs / float(sum_failed) #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name) print( "\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s" ) % (njobs, "%4.2f" % rate, errorCode, ','.join(types), name) added_in_recover = False #if options.go: # force the recovery of any task with error ? if errorCode in error_codes_to_recover: ## the error code is registered for case in error_codes_to_recover[errorCode]: match = case['details'] matched = (match == None) if not matched: matched = False for detail in details: if match in detail: print "[recover] Could find keyword", match, "in" print 50 * "#" print detail print 50 * "#" matched = True break if matched and rate > case['rate']: print "\t\t => we should be able to recover that", case[ 'legend'] task_to_recover[task].append((code, case)) added_in_recover = True message_to_user = "" else: print "\t\t recoverable but not frequent enough, needs", case[ 'rate'] if errorCode in error_codes_to_block: for case in error_codes_to_block[errorCode]: match = case['details'] matched = (match == None) if not matched: matched = False for detail in details: if match in detail: print "[block] Could find keyword", match, "in" print 50 * "#" print detail print 50 * "#" matched = True break if matched and rate > case['rate']: print "\t\t => that error means no ACDC on that workflow", case[ 'legend'] if not options.go: message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n " % ( wfo.name, errorCode, '#' * 50) recover = False added_in_recover = False if errorCode in error_codes_to_notify and not added_in_recover: print "\t\t => we should notify people on this" message_to_user += "%s has an error %s in processing.\n%s\n" % ( wfo.name, errorCode, '#' * 50) if message_to_user: print wfo.name, "to be notified to user(DUMMY)", message_to_user if message_to_ops: #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) sendLog('recoveror', message_to_ops, level='warning') if len(task_to_recover) != len(all_errors): print "Should not be doing partial ACDC. skipping" #sendEmail('recoveror','do not want to make partial acdc on %s'%wfo.name) sendLog('recoveror', 'do not want to make partial acdc on %s' % wfo.name, level='warning') recover = False if task_to_recover and recover: print "Initiating recovery" print ', '.join(task_to_recover.keys()), "to be recovered" recovering = set() for task in task_to_recover: print "Will be making a recovery workflow for", task ## from here you can fetch known solutions, to known error codes actions = list( set([ case['solution'] for code, case in task_to_recover[task] ])) acdc = singleRecovery(url, task, wfi.request, actions, do=options.do) if not acdc: if options.do: if recovering: print wfo.name, "has been partially ACDCed. Needs manual attention" #sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**']) sendLog('recoveror', "%s has had %s/%s recoveries %s only" % (wfo.name, len(recovering), len(task_to_recover), list(recovering)), level='critical') continue else: print wfo.name, "failed recovery once" #break continue else: print "no action to take further" sendLog('recoveror', "ACDC for %s can be done automatically" % wfo.name, level='critical') continue ## and assign it ? team = wfi.request['Team'] #assign_to_sites = set(SI.sites_ready) ## that needs to be massaged to prevent assigning to something out. assign_to_sites = set(SI.all_sites) parameters = { #'SiteWhitelist' : wfi.request['SiteWhitelist'], 'SiteWhitelist': sorted(assign_to_sites), 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request[ 'RequestType'] == 'TaskChain' and 'Merge' in task.split( '/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'TrustSitelists' in wfi.request and wfi.request[ 'TrustSitelists']: parameters['TrustSitelists'] = True if 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC", acdc parameters['execute'] = True wfi.sendLog('recoveror', "%s was assigned for recovery" % acdc) else: print "no assignment done with this ACDC", acdc sendLog('recoveror', "%s needs to be assigned" % (acdc), level='critical') result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) if not result: print acdc, "was not asigned" sendLog('recoveror', "%s needs to be assigned" % (acdc), level='critical') else: recovering.add(acdc) current = None if recovering: #if all went well, set the status to -recovering current = wfo.status if options.ass: current = current.replace('recovery', 'recovering') else: current = 'assistance-manual' print 'created ACDC: ' + ', '.join(recovering) else: ## was set to be recovered, and no acdc was made current = 'assistance-manual' if current: print wfo.name, "setting the status to", current wfo.status = current session.commit() else: ## this workflow should be handled manually at that point print wfo.name, "needs manual intervention" wfo.status = 'assistance-manual' session.commit()
def completor(url, specific): use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if use_mcm: mcm = McMClient(dev=False) CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() wfs = [] wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() ) ## just take it in random order so that not always the same is seen random.shuffle( wfs ) ## by workflow a list of fraction / timestamps completions = json.loads( open('%s/completions.json'%monitor_dir).read()) good_fractions = {} timeout = {} for c in CI.campaigns: if 'force-complete' in CI.campaigns[c]: good_fractions[c] = CI.campaigns[c]['force-complete'] if 'force-timeout' in CI.campaigns[c]: timeout[c] = CI.campaigns[c]['force-timeout'] long_lasting = {} overrides = getForceCompletes() if use_mcm: ## add all workflow that mcm wants to get force completed mcm_force = mcm.get('/restapi/requests/forcecomplete') ## assuming this will be a list of actual prepids overrides['mcm'] = mcm_force print "can force complete on" print json.dumps( good_fractions ,indent=2) print json.dumps( overrides, indent=2) max_force = UC.get("max_force_complete") #wfs_no_location_in_GQ = set() #block_locations = defaultdict(lambda : defaultdict(list)) #wfs_no_location_in_GQ = defaultdict(list) set_force_complete = set() for wfo in wfs: if specific and not specific in wfo.name: continue print "looking at",wfo.name ## get all of the same wfi = workflowInfo(url, wfo.name) pids = wfi.getPrepIDs() skip=False if not any([c in wfo.name for c in good_fractions]): skip=True for user,spec in overrides.items(): if wfi.request['RequestStatus']!='force-complete': if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec): sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name)) wfi = workflowInfo(url, wfo.name) forceComplete(url , wfi ) skip=True wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False) wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name)) if user == 'mcm' and use_mcm: for pid in wfi.getPrepIDs(): mcm.delete('/restapi/requests/forcecomplete/%s'%pid) #sendEmail('completor test','not force completing automatically, you have to go back to it') #skip=False break if wfo.status.startswith('assistance'): skip = True if skip: continue priority = wfi.request['RequestPriority'] if not 'Campaign' in wfi.request: continue if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue c = wfi.request['Campaign'] if not c in good_fractions: continue good_fraction = good_fractions[c] ignore_fraction = 2. lumi_expected = None event_expected = None if not 'TotalInputEvents' in wfi.request: if 'RequestNumEvents' in wfi.request: event_expected = wfi.request['RequestNumEvents'] else: print "truncated, cannot do anything" continue else: lumi_expected = wfi.request['TotalInputLumis'] event_expected = wfi.request['TotalInputEvents'] now = time.mktime(time.gmtime()) / (60*60*24.) running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition']) if not running_log: print "\tHas no running log" # cannot figure out when the thing started running continue then = running_log[-1]['UpdateTime'] / (60.*60.*24.) delay = now - then ## in days (w,d) = divmod(delay, 7 ) print "\t"*int(w)+"Running since",delay,"[days] priority=",priority monitor_delay = 7 allowed_delay = 14 if c in timeout: allowed_delay = timeout[c] monitor_delay = min(monitor_delay, allowed_delay) ### just skip if too early if delay <= monitor_delay: continue long_lasting[wfo.name] = { "delay" : delay } percent_completions = {} for output in wfi.request['OutputDatasets']: if "/DQM" in output: continue ## that does not count if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name} ## get completion fraction event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) lumi_completion=0. event_completion=0. if lumi_expected: lumi_completion = lumi_count / float( lumi_expected ) if event_expected: event_completion = event_count / float( event_expected ) #take the less optimistic percent_completions[output] = min( lumi_completion, event_completion ) completions[output]['checkpoints'].append( (now, event_completion ) ) if all([percent_completions[out] >= good_fraction for out in percent_completions]): wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, json.dumps( percent_completions, indent=2 ) )) else: long_lasting[wfo.name].update({ 'completion': sum(percent_completions.values()) / len(percent_completions), 'completions' : percent_completions }) ## do something about the agents this workflow is in long_lasting[wfo.name]['agents'] = wfi.getAgents() wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction, json.dumps( long_lasting[wfo.name]['agents'], indent=2) )) continue if all([percent_completions[out] >= ignore_fraction for out in percent_completions]): print "all is done, just wait a bit" continue for output in percent_completions: completions[output]['injected'] = then #further check on delays cpuh = wfi.getComputingTime(unit='d') ran_at = wfi.request['SiteWhitelist'] wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay)) ##### WILL FORCE COMPLETE BELOW # only really force complete after n days if delay <= allowed_delay: continue ## find ACDCs that might be running if max_force>0: forceComplete(url, wfi ) set_force_complete.add( wfo.name ) print "going for force-complete of",wfo.name wfi.sendLog('completor','going for force completing') wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name) max_force -=1 else: wfi.sendLog('completor',"too many completion this round, cannot force complete") ## do it once only for testing #break if set_force_complete: sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete))) #sendEmail('set force-complete', 'The followings were set force-complete \n%s'%('\n'.join(set_force_complete))) open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2)) text="These have been running for long" open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )) for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True): delay = info['delay'] text += "\n %s : %s days"% (wf, delay) if 'completion' in info: text += " %d%%"%( info['completion']*100 ) #if wfs_no_location_in_GQ: # sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ))) #sendEmail("long lasting workflow",text) ## you can check the log print text
def completor(url, specific): use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if use_mcm: mcm = McMClient(dev=False) CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() wfs = [] wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() ) ## just take it in random order so that not always the same is seen random.shuffle( wfs ) max_per_round = UC.get('max_per_round').get('completor',None) if max_per_round and not specific: wfs = wfs[:max_per_round] ## by workflow a list of fraction / timestamps completions = json.loads( open('%s/completions.json'%monitor_dir).read()) good_fractions = {} timeout = {} for c in CI.campaigns: if 'force-complete' in CI.campaigns[c]: good_fractions[c] = CI.campaigns[c]['force-complete'] if 'force-timeout' in CI.campaigns[c]: timeout[c] = CI.campaigns[c]['force-timeout'] long_lasting = {} overrides = getForceCompletes() if use_mcm: ## add all workflow that mcm wants to get force completed mcm_force = mcm.get('/restapi/requests/forcecomplete') ## assuming this will be a list of actual prepids overrides['mcm'] = mcm_force print "can force complete on" print json.dumps( good_fractions ,indent=2) print json.dumps( overrides, indent=2) max_force = UC.get("max_force_complete") #wfs_no_location_in_GQ = set() #block_locations = defaultdict(lambda : defaultdict(list)) #wfs_no_location_in_GQ = defaultdict(list) set_force_complete = set() for wfo in wfs: if specific and not specific in wfo.name: continue print "looking at",wfo.name ## get all of the same wfi = workflowInfo(url, wfo.name) pids = wfi.getPrepIDs() skip=False if not any([c in wfo.name for c in good_fractions]): skip=True for user,spec in overrides.items(): if wfi.request['RequestStatus']!='force-complete': if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec): sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name)) wfi = workflowInfo(url, wfo.name) forceComplete(url , wfi ) skip=True wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False) wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name)) break if wfo.status.startswith('assistance'): skip = True if skip: continue priority = wfi.request['RequestPriority'] if not 'Campaign' in wfi.request: continue if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue c = wfi.request['Campaign'] if not c in good_fractions: continue good_fraction = good_fractions[c] ignore_fraction = 2. lumi_expected = None event_expected = None if not 'TotalInputEvents' in wfi.request: if 'RequestNumEvents' in wfi.request: event_expected = wfi.request['RequestNumEvents'] else: print "truncated, cannot do anything" continue else: lumi_expected = wfi.request['TotalInputLumis'] event_expected = wfi.request['TotalInputEvents'] now = time.mktime(time.gmtime()) / (60*60*24.) running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition']) if not running_log: print "\tHas no running log" # cannot figure out when the thing started running continue then = running_log[-1]['UpdateTime'] / (60.*60.*24.) delay = now - then ## in days (w,d) = divmod(delay, 7 ) print "\t"*int(w)+"Running since",delay,"[days] priority=",priority monitor_delay = 7 allowed_delay = 14 if c in timeout: allowed_delay = timeout[c] monitor_delay = min(monitor_delay, allowed_delay) ### just skip if too early if delay <= monitor_delay: continue long_lasting[wfo.name] = { "delay" : delay } percent_completions = {} for output in wfi.request['OutputDatasets']: if "/DQM" in output: continue ## that does not count if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name} ## get completion fraction event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) lumi_completion=0. event_completion=0. if lumi_expected: lumi_completion = lumi_count / float( lumi_expected ) if event_expected: event_completion = event_count / float( event_expected ) #take the less optimistic percent_completions[output] = min( lumi_completion, event_completion ) completions[output]['checkpoints'].append( (now, event_completion ) ) if all([percent_completions[out] >= good_fraction for out in percent_completions]): wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, json.dumps( percent_completions, indent=2 ) )) else: long_lasting[wfo.name].update({ 'completion': sum(percent_completions.values()) / len(percent_completions), 'completions' : percent_completions }) ## do something about the agents this workflow is in long_lasting[wfo.name]['agents'] = wfi.getAgents() wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction, json.dumps( long_lasting[wfo.name]['agents'], indent=2) )) continue if all([percent_completions[out] >= ignore_fraction for out in percent_completions]): print "all is done, just wait a bit" continue for output in percent_completions: completions[output]['injected'] = then #further check on delays cpuh = wfi.getComputingTime(unit='d') ran_at = wfi.request['SiteWhitelist'] wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay)) ##### WILL FORCE COMPLETE BELOW # only really force complete after n days if delay <= allowed_delay: continue ## find ACDCs that might be running if max_force>0: forceComplete(url, wfi ) set_force_complete.add( wfo.name ) print "going for force-complete of",wfo.name wfi.sendLog('completor','going for force completing') wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name) max_force -=1 else: wfi.sendLog('completor',"too many completion this round, cannot force complete") ## do it once only for testing #break if set_force_complete: sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete))) #sendEmail('set force-complete', 'The followings were set force-complete \n%s'%('\n'.join(set_force_complete))) open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2)) text="These have been running for long" open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )) for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True): delay = info['delay'] text += "\n %s : %s days"% (wf, delay) if 'completion' in info: text += " %d%%"%( info['completion']*100 ) #if wfs_no_location_in_GQ: # sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ))) #sendEmail("long lasting workflow",text) ## you can check the log print text
def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost_blocks = json.loads(open('%s/lost_blocks_datasets.json'%monitor_dir).read()) lost_files = json.loads(open('%s/lost_files_datasets.json'%monitor_dir).read()) known_lost_blocks = {} known_lost_files = {} for dataset in set(lost_blocks.keys()+lost_files.keys()): b,f = findLostBlocksFiles(url, dataset) if dataset in lost_blocks and not b: print dataset,"has no really lost blocks" else: known_lost_blocks[dataset] = [i['name'] for i in b] if dataset in lost_files and not f: print dataset,"has no really lost files" else: known_lost_files[dataset] = [i['name'] for i in f] try: cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read()) except: print "inexisting transfer statuses. starting fresh" cached_transfer_statuses = {} transfer_statuses = {} ## pop all that are now in negative values for phedexid in cached_transfer_statuses.keys(): transfers = session.query(Transfer).filter(Transfer.phedexid==int(phedexid)).all() if not transfers: print phedexid,"does not look relevant to be in cache anymore. poping" print cached_transfer_statuses.pop( phedexid ) ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging wfois = [] needs = defaultdict(list) for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']: wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus']) wfi.status='away' session.commit() continue if not wfi.request['RequestStatus'] in ['assignment-approved']: ## should be setting 'away' too print wfo.name,"is",wfi.request['RequestStatus'] sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus'])) wfois.append( (wfo,wfi) ) _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): needs[wfo.name].append( dataset) done_by_input[dataset] = {} completion_by_input[dataset] = {} wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset)) open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2)) dataset_endpoints = defaultdict(set) endpoint_in_downtime = defaultdict(set) #endpoint_completed = defaultdict(set) endpoint_incompleted = defaultdict(set) #endpoint = defaultdict(set) send_back_to_considered = set() ## phedexid are set negative when not relevant anymore # probably there is a db schema that would allow much faster and simpler query for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': sendLog('stagor',"\t%s is staging for %s"%(transfer.phedexid, tr_wf.name)) skip=False if skip: sendLog('stagor',"setting %s to negative value"%transfer.phedexid) transfer.phedexid = -transfer.phedexid session.commit() continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: sendLog('stagor', "%s is not yet approved"%transfer.phedexid) approveSubscription(url, transfer.phedexid) continue ## check on transfer completion if str(transfer.phedexid) in cached_transfer_statuses: ### use a cache for transfer that already looked done sendLog('stagor',"read %s from cache"%transfer.phedexid) checks = cached_transfer_statuses[str(transfer.phedexid)] else: checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) ## just write this out transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: sendLog('stagor',"Checks for %s are %s"%( transfer.phedexid, [node.values() for node in checks.values()])) done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done ## for those that are in staging, and the destination site is in drain #if not done and tr_wf.status == 'staging': for ds in checks: for s,v in checks[ds].items(): dataset_endpoints[ds].add( s ) if done: ## transfer.status = 'done' sendLog('stagor',"%s is done"%transfer.phedexid) cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) else: sendLog('stagor',"%s is not finished %s"%(transfer.phedexid, pprint.pformat( checks ))) pprint.pprint( checks ) ## check if the destination is in down-time for ds in checks: sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough] sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready] if sites_incomplete_down: sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds)) #sites_complete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v>=good_enough] #endpoint[ds].update( sites_complete ) #endpoint[ds].update( sites_incomplete ) #endpoint_completed[ds].update( sites_complete ) endpoint_incompleted[ds].update( sites_incomplete ) endpoint_in_downtime[ds].update( sites_incomplete_down ) print "End point in down time" for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k]) for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k]) print json.dumps( endpoint_in_downtime , indent=2) open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2)) open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2)) open('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2)) already_stuck = json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ) missing_in_action = defaultdict(list) print "-"*10,"Checking on workflows in staging","-"*10 #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM'] #for what in forget_about: # if not done_by_input[what]: # done_by_input[what] = {'fake':True} ## come back to workflows and check if they can go available_cache = defaultdict(lambda : defaultdict(float)) presence_cache = defaultdict(dict) for wfo,wfi in wfois: print "#"*30 ## the site white list takes site, campaign, memory and core information (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False) se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] se_allowed.sort() se_allowed_key = ','.join(se_allowed) readys={} for need in list(primaries)+list(secondaries): if not need in done_by_input: wfi.sendLog('stagor',"missing transfer report for %s"%need) readys[need] = False ## should warn someone about this !!! ## it cannot happen, by construction sendEmail('missing transfer report','%s does not have a transfer report'%(need)) continue if not done_by_input[need] and need in list(secondaries): wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need) readys[need] = True done_by_input[need] = { "fake" : True } continue if len(done_by_input[need]) and all(done_by_input[need].values()): wfi.sendLog('stagor',"%s is ready"%need) print json.dumps( done_by_input[need] , indent=2) readys[need] = True else: wfi.sendLog('stagor',"%s is not ready"%need) print json.dumps( done_by_input[need] , indent=2) readys[need] = False if readys and all(readys.values()): if wfo.status == 'staging': wfi.sendLog('stagor',"all needs are fullfilled, setting staged") wfo.status = 'staged' session.commit() else: wfi.sendLog('stagor',"all needs are fullfilled, already") print json.dumps( readys, indent=2 ) else: wfi.sendLog('stagor',"missing requirements") copies_needed,_ = wfi.getNCopies() jump_ahead = False re_transfer = False ## there is missing input let's do something more elaborated for need in list(primaries):#+list(secondaries): if endpoint_in_downtime[need] == endpoint_incompleted[need]: #print need,"is going to an end point in downtime" wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime"%need) re_transfer=True if not se_allowed_key in available_cache[need]: available_cache[need][se_allowed_key] = getDatasetBlocksFraction( url , need, sites=se_allowed ) if available_cache[need][se_allowed_key] >= copies_needed: wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need) jump_ahead = True ## compute a time since staging to filter jump starting ? # check whether the inputs is already in the stuck list ... for need in list(primaries)+list(secondaries): if need in already_stuck: wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need) jump_ahead = True if jump_ahead or re_transfer: details_text = "checking on availability for %s to jump ahead"%wfo.name details_text += '\n%s wants %s copies'%(wfo.name,copies_needed) copies_needed = max(1,copies_needed-1) details_text += '\nlowering by one unit to %s'%copies_needed wfi.sendLog('stagor', details_text) all_check = True prim_where = set() for need in list(primaries): if not se_allowed_key in presence_cache[need]: presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) presence = presence_cache[need][se_allowed_key] prim_where.update( presence.keys() ) available = available_cache[need][se_allowed_key] this_check = (available >= copies_needed) wfi.sendLog('stagor', "%s is available %s times %s"%( need, available, this_check)) all_check &= this_check if not all_check: break for need in list(secondaries): ## I do not want to check on the secon this_check = all(done_by_input[need].values()) wfi.sendLog('stagor',"%s is all transfered %s"%(need, json.dumps(done_by_input[need], indent=2))) all_check&= this_check #if not se_allowed_key in presence_cache[need]: # presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) ## restrict to where the primary is #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where]) #this_check = all([there for (there,frac) in presence.values()]) #print need,"is present at all sites:",this_check #all_check&= this_check if all_check: wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged") wfo.status = 'staged' session.commit() else: print wfo.name,"has to wait a bit more" wfi.sendLog('stagor',"needs to wait a bit more") else: wfi.sendLog('stagor',"not checking availability") if re_transfer: wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime") if wfo.status == 'staging': wfo.status = 'considered' session.commit() send_back_to_considered.add( wfo.name ) if send_back_to_considered: #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered))) sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical') print "-"*10,"Checking on non-available datasets","-"*10 ## now check on those that are not fully available for dsname in available_cache.keys(): ## squash the se_allowed_key key available_cache[dsname] = min( available_cache[dsname].values() ) for dsname,available in available_cache.items(): using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical') else: print "would send",wf.name,"back to considered" #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical') continue ## not compatible with checking on secondary availability #if all([wf.status != 'staging' for wf in using_wfos]): # ## means despite all checks that input is not needed # continue if available < 1.: print "incomplete",dsname ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only lost_blocks,lost_files = findLostBlocksFiles( url, dsname ) lost_block_names = [item['name'] for item in lost_blocks] lost_file_names = [item['name'] for item in lost_files] if lost_blocks: #print json.dumps( lost , indent=2 ) ## estimate for how much ! fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names) print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss) if fraction_loss > 0.05: ## 95% completion mark #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='warning') ## the workflow should be rejected ! for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. setting to trouble" wf.status = 'trouble' session.commit() #sendEmail('doomed workflow','%s has too much loss on the input dataset %s. please check on stagor logs https://cmst2.web.cern.ch/cmst2/unified/logs/stagor/last.log'%(wf.name, dsname)) sendLog('stagor', '%s has too much loss on the input dataset %s. Missing %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical') else: ## probably enough to make a ggus and remove if not dsname in known_lost_blocks: #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) )) sendLog('stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ), level='critical') known_lost_blocks[dsname] = [i['name'] for i in lost_blocks] if lost_files: fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names) print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss if fraction_loss > 0.05: #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss), level='critical') for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. setting to trouble" wf.status = 'trouble' session.commit() else: ## probably enough to make a ggus and remove if not dsname in known_lost_files: #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names))) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical') known_lost_files[dsname] = [i['name'] for i in lost_files] ## should the status be change to held-staging and pending on a ticket missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] print "\t",done_by_input[dsname] print "\tneeds",len(done_by_input[dsname]) print "\tgot",done_by_input[dsname].values().count(True) print "\tmissing",missings missing_in_action[dsname].extend( missings ) rr= open('%s/lost_blocks_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_blocks, indent=2)) rr.close() rr= open('%s/lost_files_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_files, indent=2)) rr.close() open('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) ) print "Stuck transfers and datasets" print json.dumps( missing_in_action, indent=2 ) print "Going further and make a report of stuck transfers" datasets_by_phid = defaultdict(set) for dataset in missing_in_action: for phid in missing_in_action[dataset]: #print dataset,"stuck through",phid datasets_by_phid[phid].add( dataset ) bad_destinations = defaultdict(set) bad_sources = defaultdict(set) report = "" really_stuck_dataset = set() transfer_timeout = UC.get("transfer_timeout") transfer_lowrate = UC.get("transfer_lowrate") for phid,datasets in datasets_by_phid.items(): issues = checkTransferLag( url, phid , datasets=list(datasets) ) for dataset in issues: for block in issues[dataset]: for destination in issues[dataset][block]: (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination] ## count x_Buffer and x_MSS as one source redones=[] for d in dones: if d.endswith('Buffer') or d.endswith('Export'): if d.replace('Buffer','MSS').replace('Export','MSS') in dones: continue else: redones.append( d ) else: redones.append( d ) dones = list(set( redones )) #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones) if delay>transfer_timeout and rate<transfer_lowrate: if len(dones)>1: ## its the destination that sucks bad_destinations[destination].add( block ) else: dum=[bad_sources[d].add( block ) for d in dones] really_stuck_dataset.add( dataset ) print "add",dataset,"to really stuck" report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay) print "\n"*2 ## create tickets right away ? report+="\nbad sources "+",".join(bad_sources.keys())+"\n" for site,blocks in bad_sources.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n" for site,blocks in bad_destinations.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) print '\n'*2,"Datasets really stuck" print '\n'.join( really_stuck_dataset ) print '\n'*2,"report written at https://cmst2.web.cern.ch/cmst2/unified/logs/incomplete_transfers.log" print report stuck_transfers = dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset]) print '\n'*2,'Stuck dataset transfers' print json.dumps(stuck_transfers , indent=2) open('%s/stuck_transfers.json'%monitor_dir,'w').write( json.dumps(stuck_transfers , indent=2) ) open('%s/logs/incomplete_transfers.log'%monitor_dir,'w').write( report )
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=None min_transfer_priority=None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read()) for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get( prim ) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh) if in_transfer_priority==None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None or in_transfer_priority ==None: print "nothing is lining up for transfer" sendEmail("no request in staging","no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, ignored_values ) ) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, considered_values) ) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get( prim ) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle( wfs_and_wfh ) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size( i, j): if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) ) else: return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor',None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo,wfh) in wfs_and_wfh: print wfo.name,"to be transfered with priority",wfh.request['RequestPriority'] if wfh.request['RequestStatus']!='assignment-approved': if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status)) continue (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) no_budget = False if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add( wfo.name ) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if secondary: if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: if wfh.request['RequestType'] in ['ReReco']: announced,is_real = True,True else: announced,is_real = check_mcm( wfo.name ) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if not sites_allowed: wfh.sendLog('transferor',"not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) )) if blocks: print "Reading",len(blocks),"in block whitelist" can_go = True staging=False allowed=True primary_destinations = set() if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add( wfo.id ) max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) wfh.sendLog('transferor',"Would make %s from cpu requirement %s"%( copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] if len(prim_location) >= copies_needed: wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location))) continue copies_needed = max(0,copies_needed - len(prim_location)) wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed) copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute)) if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed) if copies_needed == 0: wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers) can_go = True continue elif len(prim_to_distribute)==0: wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seems available") prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical') wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere"%(prim)) staging=False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys()))) for (site,items) in spreading.items(): all_transfers[site].extend( items ) transfers_per_sites[site] += 1 primary_destinations.add( site ) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation'] print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if site in se_allowed]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination)) if len( sec_to_distribute )>0: print "secondary could go to",sorted(sec_to_distribute) sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) > sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging") wfo.status = 'staging' needs_transfer+=1 else: wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed)) wfo.status = 'staged' passing_along+=1 wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() wfh.sendLog('transferor',"needs a transfer") needs_transfer+=1 passing_along+=1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks'%len(blocks) details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets'% len(datasets) details_text += '\n\t%s'%sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: priority = 'normal' cds = [ds for ds in datasets+block_datasets if ds in max_priority] if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds]>=90000 for ds in cds]): priority = 'high' elif all([max_priority[ds]<80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos=[] fetch_from = [] if specific or options.early: fetch_from.extend(['considered','staging']) if specific: fetch_from.extend(['considered-tried']) fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from",fetch_from for status in fetch_from: wfos.extend(session.query(Workflow).filter(Workflow.status==status).all()) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )) all_stuck.update( getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) if options.priority and int(wfh.request['RequestPriority']) < options.priority: continue options_text="" if options.early: options_text+=", early option is ON" if options.partial: options_text+=", partial option is ON" options_text+=", good fraction is %.2f"%options.good_enough wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = False for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update( CI.campaigns[campaign] ) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]: banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go=True wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier))) sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys())))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]: assign_parameters.update( allowed_secondary[sec] ) if no_go: n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns: assign_parameters.update( CI.campaigns[wfh.request['Campaign']] ) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off") primary_aaa=False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update( aaa_mapping.get(site,[]) ) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed)) if not primary_aaa: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints",sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled+=1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low ))) copies_wanted = max(1., copies_wanted-1.) if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) above_good = all([available >= do_partial for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') n_stalled+=1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good): wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled+=1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor',"cannot be assign with no matched sites") sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed)) if 'parameters' in assign_parameters: parameters.update( assign_parameters['parameters'] ) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters #parameters.update( CI.parameters(wfh.request['Campaign']) ) parameters.update( assign_parameters.get('parameters',{}) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'NoGo' in split_check.values(): wfh.sendLog('assignor', "Failing splitting check") sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical') n_stalled+=1 continue if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical') ## we have a problem here, that EventBased should never be used as a backup if not options.go: n_stalled+=1 continue continue ## skip all together elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per job") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') elif 'EventsPerLumi' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this") # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat #NLI.lock( secure ) LI.lock( secure, reason = 'assigning') #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign. Please check the logs") print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def spawn_harvesting(url, wfi, sites_for_DQMHarvest): SI = global_SI() all_OK = {} requests = [] outputs = wfi.request['OutputDatasets'] if ('EnableHarvesting' in wfi.request and not wfi.request['EnableHarvesting']) and ( 'DQMConfigCacheID' in wfi.request and wfi.request['DQMConfigCacheID']): if not 'MergedLFNBase' in wfi.request: print "f****d up" sendEmail('screwed up wl cache', '%s wl cache is bad' % (wfi.request['RequestName'])) all_OK['fake'] = False return all_OK, requests wfi = workflowInfo(url, wfi.request['RequestName']) dqms = [out for out in outputs if '/DQM' in out] #if not all([in_full[dqm_input] for dqm_input in dqms]): # wfi.sendLog('closor',"will not be able to assign the harvesting: holding up") # for dqm_input in dqms: # all_OK[dqm_input] = False ## raise the subscription to high priority for dqm_input in dqms: ## handle it properly harvesting_schema = { 'Requestor': os.getenv('USER'), 'RequestType': 'DQMHarvest', 'Group': 'DATAOPS' } copy_over = [ 'AcquisitionEra', 'ProcessingString', 'DQMUploadUrl', 'CMSSWVersion', 'CouchDBName', 'CouchWorkloadDBName', 'ConfigCacheUrl', 'DbsUrl', 'inputMode', 'DQMConfigCacheID', 'OpenRunningTimeout', 'ScramArch', 'CMSSWVersion', 'Campaign', 'Memory', #dummy 'SizePerEvent', #dummy 'GlobalTag', #dummy ] for item in copy_over: if item in wfi.request: harvesting_schema[item] = copy.deepcopy(wfi.request[item]) else: print item, "is not in initial schema" harvesting_schema['InputDataset'] = dqm_input harvesting_schema['TimePerEvent'] = 1 harvesting_schema['PrepID'] = 'Harvest-' + wfi.request['PrepID'] if len(wfi.request['RequestString']) > 60: wfi.request['RequestString'] = wfi.request[ 'RequestString'][:60] print "truncating request string", wfi.request['RequestString'] harvesting_schema[ 'RequestString'] = 'HARVEST-' + wfi.request['RequestString'] harvesting_schema['DQMHarvestUnit'] = 'byRun' harvesting_schema['RequestPriority'] = min( wfi.request['RequestPriority'] * 10, 999999) harvest_request = reqMgrClient.submitWorkflow( url, harvesting_schema) if not harvest_request: print "Error in making harvesting for", wfi.request[ 'RequestName'] print "schema" print json.dumps(harvesting_schema, indent=2) harvest_request = reqMgrClient.submitWorkflow( url, harvesting_schema) if not harvest_request: print "Error twice in harvesting for", wfi.request[ 'RequestName'] print "schema" print json.dumps(harvesting_schema, indent=2) if harvest_request: requests.append(harvest_request) ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely data = reqMgrClient.setWorkflowApproved(url, harvest_request) print "created", harvest_request, "for harvesting of", dqm_input wfi.sendLog( 'closor', "created %s for harvesting of %s" % (harvest_request, dqm_input)) ## assign it directly team = wfi.request['Team'] parameters = { 'SiteWhitelist': [ SI.SE_to_CE(se) for se in wfi.request['NonCustodialSites'] ], 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], 'execute': True } #if in_full[dqm_input]: # print "using full copy at",in_full[dqm_input] # parameters['SiteWhitelist'] = [SI.SE_to_CE(se) for se in in_full[dqm_input]] #else: # print "cannot do anything if not having a full copy somewhere" # all_OK[dqm_input]=False # continue parameters['SiteWhitelist'] = sites_for_DQMHarvest result = reqMgrClient.assignWorkflow(url, harvest_request, team, parameters) if not result: #sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog( 'closor', '%s was created at announcement of %s in %s, failed to assign' % (harvest_request, dqm_input, wfi.request['RequestName'])) sendLog( 'closor', '%s was created at announcement of %s in %s, failed to assign' % (harvest_request, dqm_input, wfi.request['RequestName']), level='critical') else: #sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) wfi.sendLog( 'closor', '%s was created at announcement of %s in %s, and assigned' % (harvest_request, dqm_input, wfi.request['RequestName'])) else: #print "could not make the harvesting for",wfo.name,"not announcing" wfi.sendLog('closor', "could not make the harvesting request") sendLog('closor', "could not make the harvesting request for %s" % wfi.request['RequestName'], level='critical') all_OK[dqm_input] = False return (all_OK, requests)
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: wfs = session.query(Workflow).filter(Workflow.status=='close').all() held = set() print len(wfs),"closing" max_per_round = UC.get('max_per_round').get('closor',None) if options.limit: max_per_round = options.limit random.shuffle( wfs ) if max_per_round: wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name ) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url , batch_name, details=True) batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog('closor', 'Cannot close for now because the batch %s is not all close'% batch_name) continue if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor','Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count/float(expected_lumis)*100. completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction) wfi.sendLog('closor',"\t%s"% completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[ wfi.getCampaign()].add( completion_line ) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name,"to be announced" results=[] if not results: for out in outputs: if out in stats and not stats[out]: continue _,dsn,process_string,tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations )))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex']['request_created'][0]['id'] results.append( True ) except: results.append( 'Failed relval transfer' ) elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 2 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 0 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: print "Failed DDM, retrying to send",out,"a second time" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 1 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical') if options.force: status = True results.append( status ) if status == None: wfi.sendLog('closor',ddm_text) wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec)) else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) #print results if all(map(lambda result : result in ['None',None,True],results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace('announce','announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor',"workflow outputs are announced") else: wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results ))) elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") held.add( wfo.name ) days_late = 0. retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical') #batches = json.loads(open('batches.json').read()) for bname,go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s"% bname issues="" if batch_warnings[ bname ]: issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness issues+="\n".join( sorted( batch_warnings[ bname ] )) issues+="\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """%( bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to )
def closor(url, specific=None, options=None): if userLock(): return mlock = moduleLock() if mlock() and not options.manual: return up = componentInfo(soft=['mcm', 'wtc']) if not up.check(): return UC = unifiedConfiguration() CI = campaignInfo() BI = batchInfo() CloseI = closeoutInfo() all_late_files = [] jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() if specific: wfs = [wfo for wfo in wfs if specific in wfo.name] wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_extreme_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") closers = [] print len(wfs), "closing" th_start = time.mktime(time.gmtime()) for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue if not options.manual and ( 'cmsunified_task_HIG-RunIIFall17wmLHEGS-05036__v1_T_200712_005621_4159' .lower() in (wfo.name).lower() or 'pdmvserv_task_HIG-RunIISummer16NanoAODv7-03979__v1_T_200915_013748_1986' .lower() in (wfo.name).lower()): continue closers.append( CloseBuster( wfo=wfo, url=url, CI=CI, UC=UC, jump_the_line=jump_the_line, batch_goodness=batch_goodness, batch_go=batch_go, #stats = stats, batch_warnings=batch_warnings, batch_extreme_warnings=batch_extreme_warnings, all_late_files=all_late_files, held=held, )) run_threads = ThreadHandler(threads=closers, n_threads=options.threads, sleepy=10, timeout=None, verbose=True, label='closor') run_threads.start() ## waiting on all to complete while run_threads.is_alive(): #print "Waiting on closing threads",time.asctime(time.gmtime()) time.sleep(5) JC = JIRAClient() if up.status.get('jira', False) else None print len( run_threads.threads), "finished thread to gather information from" failed_threads = 0 for to in run_threads.threads: if to.failed: failed_threads += 1 continue if to.outs: for outO in to.outs: out = outO.datasetname odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out session.add(outO) else: odb.date = outO.date if to.to_status: to.wfo.status = to.to_status if JC and to.to_status == "done" and to.wfi: jiras = JC.find({"prepid": to.wfi.request['PrepID']}) for jira in jiras: JC.close(jira.key) if to.to_wm_status: to.wfo.wm_status = to.to_wm_status if to.closing: CloseI.pop(to.wfo.name) session.commit() th_stop = time.mktime(time.gmtime()) if wfs: time_spend_per_workflow = (th_stop - th_start) / float(len(wfs)) print "Average time spend per workflow is", time_spend_per_workflow if float(failed_threads / run_threads.n_threads) > 0: sendLog('checkor', '%d/%d threads have failed, better check this out' % (failed_threads, run_threads.n_threads), level='critical') sendEmail( 'checkor', '%d/%d threads have failed, better check this out' % (failed_threads, run_threads.n_threads)) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" #if batch_warnings[ bname ]: # issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness # issues+="\n".join( sorted( batch_warnings[ bname ] )) # issues+="\n\n" if batch_extreme_warnings[bname]: subject = "Low Statistics for %s" % bname issues = "The following datasets have outstanding completion (<50%%) issues:\n\n" issues += "\n".join(sorted(batch_extreme_warnings[bname])) issues += "\n\n" elif batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = "" text += "Dear all,\n\n" text += "A batch of release validation workflows has finished.\n\n" text += "Batch ID:\n\n" text += "%s\n\n" % (bname) text += "Detail of the workflows\n\n" text += "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s\n\n" % ( bname) text += "%s\n\n" % (issues) text += "This is an automated message.\n\n" text += "" to = ['*****@*****.**'] sendEmail(subject, text, destination=to) ## just announced ; take it out now. BI.pop(bname) deleteCampaignConfig(bname) if os.path.isfile('.closor_stop'): print "The loop on workflows was shortened" sendEmail('closor', 'Closor loop was shortened artificially using .closor_stop') os.system('rm -f .closor_stop')
def closor(url, specific=None): if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #LI = lockInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## manually closed-out workflows should get to close with checkor if specific: wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status=='close').all() held = set() for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name ) wfo.wm_status = wfi.request['RequestStatus'] if wfi.request['RequestStatus'] in ['announced','normal-archived']: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status)) session.commit() expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() wfi.sendLog('closor',"\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)) #print wfo.fraction_for_closing, lumi_count, expected_lumis #fraction = wfo.fraction_for_closing #fraction = 0.0 #all_OK.append((float(lumi_count) > float(expected_lumis*fraction))) all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']: print wfo.name,"to be announced" results=[]#'dummy'] if not results: for out in outputs: if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) tier = out.split('/')[-1] campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) continue n_copies = 2 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) ## inject to DDM when necessary if to_DDM: #print "Sending",out," to DDM" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'%(n_copies, out,destination_spec)) print p.read() status = p.close() if status!=None: print "Failed DDM, retrying a second time" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'%(n_copies, out,destination_spec)) print p.read() status = p.close() if status!=None: sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") results.append( status ) if status == None: wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( n_copies, out,destination_spec)) else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) #print results if all(map(lambda result : result in ['None',None,True],results)): wfo.status = 'done' session.commit() wfi.sendLog('closor',"workflow is announced") else: print "ERROR with ",wfo.name,"to be announced",json.dumps( results ) else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") held.add( wfo.name ) days_late = 0. retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) sendEmail('waiting for files to announce', subject) sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))
def close(self): if os.path.isfile('.closor_stop'): print "The closing of workflows is shortened" return url = self.url batch_go = self.batch_go CI = self.CI UC = self.UC wfo = self.wfo jump_the_line = self.jump_the_line batch_goodness = self.batch_goodness check_parentage_to_announce = UC.get('check_parentage_to_announce') check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## what is the expected #lumis self.wfi = workflowInfo(url, wfo.name) wfi = self.wfi wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( map( lambda s: not s in [ 'completed', 'running-open', 'running-closed', 'acquired', 'staged', 'staging', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name)) return if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? self.to_status = 'done' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, self.to_wm_status)) return if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) self.outs.append(Output(datasetname=out)) odb = self.outs[-1] odb.workflow = wfo odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) fraction = lumi_count / float(expected_lumis) * 100. completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: self.batch_warnings[wfi.getCampaign()].add(completion_line) if fraction < 50: self.batch_extreme_warnings[wfi.getCampaign()].add( completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on #in_full = {} for out in outputs: all_OK[out] = True #in_full[out] = [] #presence = getDatasetPresence( url, out ) #where = [site for site,info in presence.items() if info[0]] #if where: # all_OK[out] = True # print out,"is in full at",",".join(where) # in_full[out] = copy.deepcopy(where) #else: # going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] # wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) # at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) # else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) # print json.dumps( at_destination ) # print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! #for there in going_to: # late_info = findLateFiles(url, out, going_to = there ) # for l in late_info: # l.update({"workflow":wfo.name,"dataset":out}) # self.all_late_files.extend( late_info ) # if check_fullcopy_to_announce: ## only set this false if the check is relevant # all_OK[out] = False ## verify if we have to do harvesting #if not options.no_harvest and not jump_the_line: # #(OK, requests) = spawn_harvesting(url, wfi, in_full) # sites_for_DQMHarvest = UC.get("sites_for_DQMHarvest") # (OK, requests) = spawn_harvesting(url, wfi, sites_for_DQMHarvest) # print "Harvesting workflow has been created and assigned to: " # print sites_for_DQMHarvest # all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: print "dealing with", out if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: print "setting valid" results.append( setDatasetStatus(out, 'VALID', withFiles=False)) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier in UC.get("tiers_to_rucio_relval"): wfi.sendLog( 'closor', "Data Tier: %s is blacklisted, so skipping dataset placement for: %s" % (tier, out)) continue if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_rucio_nonrelval"): wfi.sendLog( 'closor', "Data Tier: %s is blacklisted, so skipping dataset placement for: %s" % (tier, out)) continue if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet else: print wfo.name, "no stats for announcing", out results.append('No Stats') # adding check for PrentageResolved flag from ReqMgr: if wfi.request[ 'RequestType'] == 'StepChain' and check_parentage_to_announce: if wfi.request['ParentageResolved']: results.append(True) else: wfi.sendLog( 'closor', "Delayed announcement of %s due to unresolved Parentage dependencies" % wfi.request['RequestName']) results.append('No ParentageResolved') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) self.to_wm_status = wl_bis.request['RequestStatus'] if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: self.to_status = wfo.status.replace( 'announce', 'announced') else: self.to_status = 'done' self.closing = True wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): self.to_status = 'forget' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, self.to_wm_status)) else: self.to_status = 'trouble' self.to_wm_status = wfi.request['RequestStatus'] else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") self.held.add(wfo.name)
def closor(url, specific=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## manually closed-out workflows should get to close with checkor if specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status == 'close').all() held = set() max_per_round = UC.get('max_per_round').get('closor', None) random.shuffle(wfs) if max_per_round: wfs = wfs[:max_per_round] for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.request['RequestStatus'] in ['announced', 'normal-archived']: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() wfi.sendLog( 'closor', "\t%60s %d/%d = %3.2f%%" % (out, lumi_count, expected_lumis, lumi_count / float(expected_lumis) * 100.)) #print wfo.fraction_for_closing, lumi_count, expected_lumis #fraction = wfo.fraction_for_closing #fraction = 0.0 #all_OK.append((float(lumi_count) > float(expected_lumis*fraction))) all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and wfi.request['RequestStatus'] in [ 'closed-out' ]: print wfo.name, "to be announced" results = [] #'dummy'] if not results: for out in outputs: if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) tier = out.split('/')[-1] campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 2 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) ## inject to DDM when necessary if to_DDM: #print "Sending",out," to DDM" p = os.popen( 'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec)) print p.read() status = p.close() if status != None: print "Failed DDM, retrying a second time" p = os.popen( 'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec)) print p.read() status = p.close() if status != None: #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") sendLog('closor', "could not add " + out + " to DDM pool. check closor logs.", level='critical') results.append(status) if status == None: wfi.sendLog( 'closor', '%s is send to AnalysisOps DDM pool in %s copies %s' % (n_copies, out, destination_spec)) else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): wfo.status = 'done' session.commit() wfi.sendLog('closor', "workflow is announced") else: print "ERROR with ", wfo.name, "to be announced", json.dumps( results) else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held ))) sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(held)), level='critical')
def injector(url, options, specific): mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc']) if not up.check(): return use_mcm = up.status['mcm'] UC = unifiedConfiguration() transform_keywords = UC.get('convert_to_stepchain') workflows = getWorkflows(url, status=options.wmstatus, user=options.user) for user in UC.get("user_rereco"): workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco")) for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")): workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain")) for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")): workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults")) print len(workflows), "in line" cannot_inject = set() to_convert = set() status_cache = defaultdict(str) ## browse for assignment-approved requests, browsed for ours, insert the diff for wf in workflows: if specific and not specific in wf: continue exists = session.query(Workflow).filter(Workflow.name == wf).first() if not exists: wfi = workflowInfo(url, wf) ## check first that there isn't related here with something valid can_add = True ## first try at finding a match familly = session.query(Workflow).filter( Workflow.name.contains(wfi.request['PrepID'])).all() if not familly: pids = wfi.getPrepIDs() req_familly = [] for pid in pids: req_familly.extend(getWorkflowById(url, pid, details=True)) familly = [] print len(req_familly), "members" for req_member in req_familly: #print "member",req_member['RequestName'] owfi = workflowInfo(url, req_member['RequestName'], request=req_member) other_pids = owfi.getPrepIDs() if set(pids) == set(other_pids): ## this is a real match familly.extend( session.query(Workflow).filter( Workflow.name == req_member['RequestName']).all()) for lwfo in familly: if lwfo: ## we have it already if not lwfo.status in [ 'forget', 'trouble', 'forget-unlock', 'forget-out-unlock' ]: wfi.sendLog( 'injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status)) sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status), level='critical') print "Should not put", wf, "because of", lwfo.name, lwfo.status cannot_inject.add(wf) can_add = False ## add a check on validity of input datasets _, prim, par, sec = wfi.getIO() for d in list(prim) + list(par) + list(sec): if not d in status_cache: status_cache[d] = getDatasetStatus(d) if status_cache[d] != 'VALID': wfi.sendLog( 'injector', "One of the input is not VALID. %s : %s" % (d, status_cache[d])) sendLog('injector', "One of the input of %s is not VALID. %s : %s" % (wf, d, status_cache[d]), level='critical') can_add = False ## check for any file in phedex, to verify existence _, ph_files, _, _ = getDatasetFiles(url, d) if not ph_files and not ('StoreResults' == wfi.request.setdefault( 'RequestType', None)): wfi.sendLog( 'injector', "One of the input has no file in phedex: %s" % d) sendLog('injector', "One of the input has no file in phedex: %s" % d, level='critical') can_add = False ### ban some workflow that you don't like anymore #outputs = wfi.request['OutputDatasets'] if not can_add: continue ## temporary hack to transform specific taskchain into stepchains #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords) good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=None) ## match keywords and technical constraints #if (not options.no_convert) and good_for_stepchain and not wfi.isRelval(): # to_convert.add( wf ) # wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf) # #sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf) wfi.sendLog('injector', "considering %s" % wf) new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus) session.add(new_wf) session.commit() time.sleep(0.5) else: #print "already have",wf pass if cannot_inject: #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject))) sendLog( 'injector', 'These workflow cannot be added in because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning') for wf in to_convert: os.system( './Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s' % wf) ## passing a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) no_replacement = set() #print "getting all transfers" #all_transfers=session.query(Transfer).all() #print "go!" ## pick up replacements for wf in session.query(Workflow).filter( Workflow.status == 'trouble').all(): print wf.name if specific and not specific in wf.name: continue print wf.name wfi = workflowInfo(url, wf.name) wl = wfi.request #getWorkLoad(url, wf.name) familly = getWorkflowById(url, wl['PrepID']) true_familly = [] for member in familly: if member == wf.name: continue fwl = getWorkLoad(url, member) if options.replace: if member != options.replace: continue else: if fwl['RequestDate'] < wl['RequestDate']: continue if fwl['RequestType'] == 'Resubmission': continue if fwl['RequestStatus'] in ['None', None, 'new']: continue if fwl['RequestStatus'] in [ 'rejected', 'rejected-archived', 'aborted', 'aborted-archived' ]: continue true_familly.append(fwl) if len(true_familly) == 0: #sendLog('injector','%s had no replacement'%wf.name, level='critical') if wfi.isRelval(): #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.') wfi.sendLog( 'injector', 'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget' ) wf.status = 'forget' session.commit() else: wfi.sendLog( 'injector', 'the workflow was found in trouble with no replacement') no_replacement.add(wf.name) continue else: wfi.sendLog( 'injector', 'the workflow was found in trouble and has a replacement') print wf.name, "has", len(familly), "familly members" print wf.name, "has", len(true_familly), "true familly members" ##we cannot have more than one of them !!! pick the last one if len(true_familly) > 1: #sendEmail('multiple wf','please take a look at injector for %s'%wf.name) sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical') for fwl in true_familly[-1:]: member = fwl['RequestName'] new_wf = session.query(Workflow).filter( Workflow.name == member).first() if not new_wf: sendLog('injector', "putting %s as replacement of %s" % (member, wf.name)) status = 'away' if fwl['RequestStatus'] in ['assignment-approved']: status = 'considered' new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus']) wf.status = 'forget' session.add(new_wf) else: if new_wf.status == 'forget': continue sendLog( 'injector', "getting %s as replacement of %s" % (new_wf.name, wf.name)) wf.status = 'forget' for tr in session.query(TransferImp).filter( TransferImp.workflow_id == wf.id).all(): ## get all transfer working for the old workflow existing = session.query(TransferImp).filter( TransferImp.phedexid == tr.phedexid).filter( TransferImp.workflow_id == new_wf.id).all() tr.active = False ## disable the old one if not existing: ## create the transfer object for the new dependency tri = TransferImp(phedexid=tr.phedexid, workflow=new_wf) session.add(tri) session.commit() ## don't do that automatically #wf.status = 'forget' session.commit() if no_replacement: #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement))) sendLog('injector', 'workflow with no replacement\n%s \n are dangling there' % ('\n'.join(no_replacement)), level='critical')
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read()) aaa_mapping = json.loads( open('%s/equalizor.json' % monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read())) all_stuck.update(getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') ##order by priority instead of random if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key=lambda r: r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank(wfn): return cache.index(wfn) if wfn in cache else 0 wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True) print "10 first", [wfo.name for wfo in wfos[:10]] print "10 last", [wfo.name for wfo in wfos[-10:]] else: random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" if options.partial: options_text += ", partial option is ON" options_text += ", good fraction is %.2f" % options.good_enough wfh.sendLog('assignor', "%s to be assigned%s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys())))) sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = wfh.getBlockWhiteList() rwl = wfh.getRunWhiteList() if rwl: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=rwl))) lwl = wfh.getLumiWhiteList() if lwl: ## augment with lumi white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=lwl))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog( 'assignor', "Overiding partial copy assignment to %.2f fraction" % do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] if secondary_aaa: if not one_secondary_locations: sec_availability = getDatasetBlocksFraction(url, sec) if sec_availability >= 1. and options.go: ## there is at least one copy of each block on disk. We should go ahead and let it go. wfh.sendLog( 'assignor', "The secondary %s is available %s times on disk, and usable" % (sec, sec_availability)) else: ## not even a copy on disk anywhere !!!! sites_allowed = [] ## will block the assignment wfh.sendLog( 'assignor', "The secondary %s is nowhere on disk" % sec) #just continue without checking continue #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From/after secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) if primary_aaa: available_fractions[prim] = getDatasetBlocksFraction( url, prim, only_blocks=blocks) sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] if primary_aaa: sites_all_data = list( set([ SI.SE_to_CE(psite) for (psite, (there, frac)) in presence.items() if there ])) sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] if primary_aaa: sites_with_any_data = list( set([SI.SE_to_CE(psite) for psite in presence.keys()])) wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite for osite in opportunistic_sites if osite in SI.sites_not_ready ])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if not isStoreResults: sites_allowed = sites_with_any_data else: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue available_fractions = {} wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints", sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled += 1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog( 'assignor', "The workflow can run at %s under low pressure currently" % (','.join(allowed_and_low))) copies_wanted = max(1., copies_wanted - 1.) if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) above_good = all([ available >= do_partial for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( 'assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay') n_stalled += 1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not ( do_partial and above_good): wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled += 1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) if isHEPCloudReady(url) and wfh.isGoodForNERSC(): parameters['Team'] = 'hepcloud' parameters['SiteWhitelist'] = ['T3_US_NERSC'] if primary: parameters['TrustSitelists'] = True if secondary: parameters['TrustPUSitelists'] = True sendEmail("sending work to hepcloud", "pleasse check on %s" % wfh.request['RequestName'], destination=['*****@*****.**']) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def singleRecovery(url, task, initial, actions, do=False): print "Inside single recovery!" payload = { "Requestor": os.getenv('USER'), "Group": 'DATAOPS', "RequestType": "Resubmission", "ACDCServer": initial['ConfigCacheUrl'], "ACDCDatabase": "acdcserver", "OriginalRequestName": initial['RequestName'], "OpenRunningTimeout": 0 } copy_over = [ 'PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion' ] for c in copy_over: if c in initial: payload[c] = copy.deepcopy(initial[c]) else: print c, "not in the initial payload" #a massage ? boost the recovery over the initial wf # payload['RequestPriority'] *= 10 #Max priority is 1M payload['RequestPriority'] = min(500000, payload['RequestPriority'] * 2) ## never above 500k #change parameters based on actions here if actions: for action in actions: if action.startswith('mem') and actions[action] != "" and actions[ action] != 'Same': #if multicore parameter is also used, need to scale memory by the new number of cores if 'multicore' in actions and actions['multicore'] != "": continue ## Taskchains needs to be treated special to set the memory to all tasks set_to = int(actions[action]) if 'TaskChain' in initial: mem_dict = {} it = 1 while True: t = 'Task%d' % it it += 1 if t in initial: tname = payload.setdefault(t, initial[t])['TaskName'] mem = mem_dict.setdefault(tname, payload[t]['Memory']) mem_dict[tname] = set_to else: break payload['Memory'] = mem_dict print "Memory set to: ", json.dumps(mem_dict, indent=2) else: payload['Memory'] = set_to print "Memory set to: ", set_to if action.startswith('multicore') and actions[action] != "": set_to = int(actions[action]) ## Taskchains needs to be treated special to set the multicore and memory values to all tasks if 'TaskChain' in initial: mem_dict = payload['Memory'] if type( payload['Memory']) == dict else {} core_dict = {} it = 1 while True: t = 'Task%d' % it it += 1 if t in initial: tname = payload.setdefault(t, initial[t])['TaskName'] mem = mem_dict.setdefault(tname, payload[t]['Memory']) #Need to scale the memory by the new number of cores initial_cores = payload[t].setdefault( 'Multicore', 1) if 'memory' in actions and actions[ 'memory'] != "" and actions[ 'memory'] != 'Same': mem = actions['memory'] fraction_constant = 0.4 mem_per_core_c = int((1 - fraction_constant) * mem / float(initial_cores)) mem_dict[tname] = int(mem + (set_to - initial_cores) * mem_per_core_c) core_dict[tname] = set_to print "For ", t print "Multicore set to ", set_to print "Memory set to ", mem_dict[tname] else: break payload['Memory'] = mem_dict payload['Multicore'] = core_dict else: #Need to scale the memory by the new number of cores initial_cores = initial.setdefault('Multicore', 1) mem = payload['Memory'] if 'memory' in actions and actions[ 'memory'] != "" and actions['memory'] != 'Same': mem = actions['memory'] fraction_constant = 0.4 mem_per_core_c = int( (1 - fraction_constant) * mem / float(initial_cores)) payload['Multicore'] = set_to payload['Memory'] = int(mem + (set_to - initial_cores) * mem_per_core_c) print "Multicore set to ", set_to print "Memory set to ", payload['Memory'] if action.startswith('split'): split_alert = (initial['RequestType'] in ['MonteCarlo']) for key in initial: if key == 'SplittingAlgo' and (initial[key] in ['EventBased']): split_alert = True elif key.startswith('Task') and key != 'TaskChain': for key2 in initial[key]: if key2 == 'TaskName': this_taskname = initial[key][key2] recover_task = task.split('/')[-1] print "For recovery of task", recover_task print "Looking at task", this_taskname if (recover_task == this_taskname) and ( initial[key]['SplittingAlgo'] in ['EventBased']): ## the task to be recovered is actually of the wrong type to allow change of splitting sendLog( 'actor', 'To recover on %s, changing the splitting on %s is not really allowed and this will be ignored instead of failing acdc.' % (task, initial[key]['SplittingAlgo']), level='critical') ## do not send an alert and stop the acdc #split_alert = True if split_alert: sendLog('actor', 'Cannot change splitting for %s' % initial['RequestName'], level='critical') print "I should not be doing splitting for this type of request", initial[ 'RequestName'] return None acdc_round = 0 initial_string = payload['RequestString'] if initial_string.startswith('ACDC'): if initial_string[4].isdigit(): acdc_round = int(initial_string[4]) acdc_round += 1 initial_string = initial_string.replace('ACDC_', '').replace( 'ACDC%d_' % (acdc_round - 1), '') payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string) payload['InitialTaskPath'] = task if not do: print json.dumps(payload, indent=2) return None print "ACDC payload" # print json.dumps( payload , indent=2) print actions ## submit here acdc = reqMgrClient.submitWorkflow(url, payload) if not acdc: print "Error in making ACDC for", initial["RequestName"] acdc = reqMgrClient.submitWorkflow(url, payload) if not acdc: print "Error twice in making ACDC for", initial["RequestName"] sendLog('actor', 'Failed twice in making ACDCs for %s!' % initial['RequestName'], level='critical') return None ## change splitting if requested if actions: for action in actions: if action.startswith('split'): acdcInfo = workflowInfo(url, acdc) splittings = acdcInfo.getSplittingsNew(strip=True) if actions[action] != 'Same' and actions[action] != 'max': factor = int( actions[action][0:-1]) if 'x' in actions[action] else 2 for split in splittings: split_par = split['splitParams'] if split['splitAlgo'] in ['EventBased']: sendLog( 'actor', "Changing the splitting on %s for %s is not permitted. Not changing." % (split['splitAlgo'], initial["RequestName"]), level='critical') continue for act in [ 'avg_events_per_job', 'events_per_job', 'lumis_per_job' ]: if act in split_par: print "Changing %s (%d) by a factor %d" % ( act, split_par[act], factor), split_par[act] /= factor print "to", split_par[act] break #split['requestName'] = acdc #print "changing the splitting of",acdc #print json.dumps( split, indent=2 ) #print reqMgrClient.setWorkflowSplitting(url, acdc, split ) elif 'max' in actions[action]: for split in splittings: split_par = split['splitParams'] for act in [ 'avg_events_per_job', 'events_per_job', 'lumis_per_job' ]: if act in split_par: print "Changing %s (%d) " % (act, split_par[act]), split_par[act] = 1 print "to max splitting ", split_par[act] break #split['requestName'] = acdc #print "changing the splitting of",acdc #print json.dumps( split, indent=2 ) #print reqMgrClient.setWorkflowSplitting(url, acdc, split ) print "changing the splitting of", acdc print json.dumps(splittings, indent=2) done = reqMgrClient.setWorkflowSplitting(url, acdc, splittings) ## check on done == True data = reqMgrClient.setWorkflowApproved(url, acdc) print data return acdc
def batchor( url ): UC = unifiedConfiguration() SI = global_SI() ## get all workflows in assignment-approved with SubRequestType = relval all_wfs = [] for user in UC.get("user_relval"): all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain') ) wfs = filter( lambda r :r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs) ## need a special treatment for those hi_wfs = filter( lambda r :r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs) by_campaign = defaultdict(set) by_hi_campaign = defaultdict(set) for wf in wfs: print "Relval:",wf['RequestName'], wf['Campaign'] #by_campaign[wf['Campaign']].add( wf['RequestName'] ) by_campaign[wf['Campaign']].add( wf['PrepID'] ) for wf in hi_wfs: print "HI Relval:",wf['RequestName'], wf['Campaign'] #by_hi_campaign[wf['Campaign']].add( wf['RequestName'] ) by_hi_campaign[wf['Campaign']].add( wf['PrepID'] ) default_setup = { "go" :True, "parameters" : { "SiteWhitelist": [ "T1_US_FNAL" ], "MergedLFNBase": "/store/relval", "Team" : "relval", "NonCustodialGroup" : "RelVal" }, "custodial" : "T1_US_FNAL_MSS", "custodial_override" : ["DQMIO"], "phedex_group" : "RelVal", "lumisize" : -1, "fractionpass" : 0.0, "maxcopies" : 1 } default_hi_setup = copy.deepcopy( default_setup ) add_on = {} batches = json.loads( open('batches.json').read() ) relval_routing = UC.get('relval_routing') def pick_one_site( p): ## modify the parameters on the spot to have only one site if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(p["parameters"]["SiteWhitelist"])>1: choose_from = list(set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready)) picked = random.choice( choose_from ) print "picked",picked,"from",choose_from p["parameters"]["SiteWhitelist"] = [picked] for campaign in by_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy( default_setup ) for key in relval_routing: if key in campaign: ## augment with the routing information augment_with = relval_routing[key] print "Modifying the batch configuration because of keyword",key print "with",augment_with setup = deep_update( setup, augment_with ) #if 'cc7' in campaign: setup["parameters"]["SiteWhitelist"] = ["T2_US_Nebraska"] pick_one_site( setup ) add_on[campaign] = setup sendLog('batchor','Adding the relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_campaign[campaign] )) + batches[campaign] )) for campaign in by_hi_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy( default_hi_setup ) hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"]) setup["parameters"]["SiteWhitelist"]=[ hi_site ] #setup["parameters"]["SiteWhitelist"]=["T1_DE_KIT","T1_FR_CCIN2P3"] pick_one_site( setup ) add_on[campaign] = setup sendLog('batchor','Adding the HI relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_hi_campaign[campaign] )) + batches[campaign] )) open('batches.json','w').write( json.dumps( batches , indent=2 ) ) ## open the campaign configuration campaigns = json.loads( open('campaigns.relval.json').read() ) ## protect for overwriting ?? for new_campaign in list(set(add_on.keys())-set(campaigns.keys())): ## this is new, and can be announced as such print new_campaign,"is new stuff" subject = "Request of RelVal samples batch %s"% new_campaign text="""Dear all, A new batch of relval workflows was requested. Batch ID: %s Details of the workflows: https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s This is an automated message"""%( new_campaign, new_campaign, ) print subject print text to = ['*****@*****.**'] sendEmail(subject, text, destination=to) sendLog('batchor',text, level='critical') ## go through all existing campaigns and remove the ones not in use anymore ? for old_campaign in campaigns.keys(): all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True) is_batch_done = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [wf['RequestStatus']for wf in all_in_batch])) ## check all statuses if is_batch_done: #print "batch",old_campaign,"can be closed or removed if necessary" #campaigns[old_campaign]['go'] = False ## disable campaigns.pop( old_campaign ) ## or just drop it all together ? print "batch",old_campaign," configuration was removed" ## merge all anyways campaigns.update( add_on ) ## write it out for posterity open('campaigns.json.updated','w').write(json.dumps( campaigns , indent=2)) ## read back rread = json.loads(open('campaigns.json.updated').read()) os.system('mv campaigns.json.updated campaigns.relval.json')
def actor(url, options=None): mlock = moduleLock(wait=False, silent=True) if mlock(): return if userLock('actor'): return up = componentInfo(soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() WC = wtcClient() WI = wtcInfo() JC = JIRAClient() action_list = WC.get_actions() if action_list is None: print "Not able to load action list" sendLog('actor', 'Not able to load action list', level='critical') return if options.actions: action_list = json.loads(open(options.actions).read()) print json.dumps(action_list, indent=2) if not action_list: print "EMPTY!" return wf_list = action_list.keys() print json.dumps(sorted(wf_list), indent=2) if options.spec: wf_list = [wf for wf in wf_list if options.spec in wf] max_per_round = UC.get('max_per_round').get('actor', None) if max_per_round: random.shuffle(wf_list) wf_list = wf_list[:max_per_round] for wfname in wf_list: print '-' * 100 print "Looking at", wfname, "for recovery options" to_clone = False to_acdc = False to_force = False to_hold = False something_to_do = False tasks = action_list[wfname].get('Parameters', None) to_acdc = action_list[wfname].get('Action', None) == 'acdc' to_clone = action_list[wfname].get('Action', None) == 'clone' to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['by-pass', 'bypass'] to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['onhold', 'on-hold'] if not to_acdc and not to_clone and not to_force and not to_hold: sendLog( 'actor', 'Action submitted for something other than acdc, clone, bypass or hold for workflow %s' % wfname, level='critical') print json.dumps(action_list[wfname], indent=2) continue if not tasks and to_acdc: sendLog('actor', 'Empty action submitted for workflow %s' % wfname, level='critical') print "Moving on. Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor', 'Going to clone %s' % wfname) comment = "" if 'comment' in tasks: comment = ", reason: " + tasks['comment'] wfi.sendLog( 'actor', "invalidating the workflow by traffic controller %s" % comment) #Reject all workflows in the family inv_results = invalidate(url, wfi, only_resub=False, with_output=True) all_good = all(inv_results) if all_good: wfi.sendLog('actor', "%s and children are rejected" % wfname) else: wfi.sendLog('actor', "Failed to reject the request and dependents") sendLog('actor', 'Failed to reject the familly of %s' % wfname, level='critical') continue cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except Exception as e: sendLog( 'actor', 'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.' % wfname, level='critical') wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) print str(e) ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again if not cloned: recover = False wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) sendLog('actor', 'Failed to create clone for %s!' % wfname, level='critical') else: wfi.sendLog('actor', "Workflow %s cloned into %s" % (wfname, cloned)) ## set to trouble for swift replacement for wfo in session.query(Workflow).filter( Workflow.name == wfname).all(): wfo.status = 'trouble' session.commit() #=========================================================== elif to_force: wfi.sendLog( 'actor', 'Force-completing from workflow traffic controler request') WI.add(action='force', keyword=wfname, user=action_list[wfname].get('user', 'unified')) elif to_hold: wfi.sendLog('actor', 'Holding on workflow traffic controler request') WI.add(action='hold', keyword=wfname, user=action_list[wfname].get('user', 'unified')) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append( {setting: allTasksDefaults[setting]}) print "Tasks is " print json.dumps(tasks, indent=2) all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog( 'actor', 'Cannot create ACDCS for %s because WMErr cannot be reached.' % wfname, level='critical') continue if not WMErr: wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname) print "FYI getWMErrors is blank. Presumably there are only unreported errors" # continue try: where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo( ) print "Where to run = " print where_to_run if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') continue except: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') print "Moving on. Cannot access recovery info for " + wfname continue if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because site list cannot be found.' % wfname, level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 if WMErr: for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for", wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog( 'actor', 'Cannot create ACDCS for %s because it is a pLHE workflow.' % wfname, level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task print "Full task name is " + fulltaskname print where_to_run.keys() wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in [ 'Processing', 'Production', 'Merge' ]: wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks" % (fulltaskname, task_info.taskType)) if not fulltaskname in where_to_run.keys(): wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because there is no acdc doc for it anyways." % (fulltaskname)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites = [SI.SE_to_CE(actions[action])] else: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in actions[action] ])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in where_to_run[fulltaskname] ])) print "Found", sorted( assign_to_sites ), "as sites where to run the ACDC at, from the acdc doc of ", wfname print "Going to run at", sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do=options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog( 'actor', 'ACDC created for task %s. Actions taken \n%s' % (fulltaskname, json.dumps(actions))) jira_comment = "%s created ACDC for task %s with action %s" % ( action_list[wfname].get('user', 'unified'), task.split('/')[-1], json.dumps(actions), ) reason = action_list[wfname].get('Reason', None) if reason: jira_comment += '\ndue to: %s' % (reason) #team = wfi.request['Teams'][0] team = 'production' parameters = { 'SiteWhitelist': sorted(assign_to_sites), 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request[ 'RequestType'] == 'TaskChain' and 'Merge' in task.split( '/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists'] == 'true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'secondary' in actions: if actions['secondary'] == 'enabled': print 'Enabling reading the secondary input via xrootd' parameters['TrustPUSitelists'] = True elif actions['secondary'] == 'disabled': parameters['TrustPUSitelists'] = False #in case secondary is blank or not set to enabled or disabled elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC", acdc parameters['execute'] = True #wfi.sendLog('actor',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC", acdc sendLog('actor', "%s needs to be assigned" % (acdc), level='critical') wfi.sendLog( 'actor', "%s needs to be assigned by hand" % (acdc)) continue # print parameters result = reqMgrClient.assignWorkflow( url, acdc, team, parameters) if not result: print acdc, "was not assigned" sendLog('actor', "%s failed to be assigned" % (acdc), level='critical') wfi.sendLog( 'actor', "%s failed to get assigned for recovery" % acdc) else: wfi.sendLog('actor', "%s was assigned for recovery" % acdc) recovering.add(acdc) #wfi.sendLog('actor',"ACDCs created for %s"%wfname) try: if jira_comment: jiras = JC.find( {'prepid': wfi.request['PrepID']}) if len(jiras) == 1: ## put a comment on the single corresponding ticket JC.comment(jiras[0].key, jira_comment) JC.progress(jiras[0].key) except Exception as e: print "failed with JIRA" print str(e) #=========================================================== if recover and options.do: r = WC.remove_action(wfname) if not r: sendLog( 'actor', 'not able to remove the action, interlocking the module', level='critical') os.system('touch %s/actor.failed-%s.lock' % (base_eos_dir, os.getpid())) sys.exit(-1) ## update the status with recovering removing manual for wfo in session.query(Workflow).filter( Workflow.name == wfname).all(): wfo.status = wfo.status.replace('manual', 'recovering') session.commit() if message_to_user: print wfname, "to be notified to user(DUMMY)", message_to_user if message_to_ops: print 'message' #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return
def injector(url, options, specific): use_mcm = True up = componentInfo( mcm = use_mcm, soft=['mcm'] ) if not up.check(): return use_mcm = up.status['mcm'] workflows = getWorkflows(url, status=options.wmstatus, user=options.user) workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco")) ## regardless of users, pick up all ReReco on the table print len(workflows),"in line" cannot_inject = set() status_cache = defaultdict(str) ## browse for assignment-approved requests, browsed for ours, insert the diff for wf in workflows: if specific and not specific in wf: continue exists = session.query(Workflow).filter(Workflow.name == wf ).first() if not exists: wfi = workflowInfo(url, wf) #wl = getWorkLoad(url, wf) ## check first that there isn't related here with something valid can_add = True ## first try at finding a match # print wfi.request familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all() if not familly: #req_familly = getWorkflowById( url, wl['PrepID']) #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly] pids = wfi.getPrepIDs() req_familly = [] for pid in pids: req_familly.extend( getWorkflowById( url, pid, details=True) ) familly = [] print len(req_familly),"members" for req_member in req_familly: #print "member",req_member['RequestName'] owfi = workflowInfo(url, req_member['RequestName'], request=req_member) other_pids = owfi.getPrepIDs() if set(pids) == set(other_pids): ## this is a real match familly.extend( session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all() ) for lwfo in familly: if lwfo: ## we have it already if not lwfo.status in ['forget','trouble','forget-unlock','forget-out-unlock']: sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status )) print "Should not put",wf,"because of",lwfo.name,lwfo.status cannot_inject.add( wf ) can_add = False ## add a check on validity of input datasets _,prim,par,sec = wfi.getIO() for d in list(prim)+list(par)+list(sec): if not d in status_cache: status_cache[d] = getDatasetStatus(d) if status_cache[d] != 'VALID': wfi.sendLog('injector',"One of the input is not VALID. %s : %s"%( d, status_cache[d])) sendLog('injector',"One of the input of %s is not VALID. %s : %s"%( wf, d, status_cache[d])) can_add = False if not can_add: continue wfi.sendLog('injector',"considering %s"%wf) new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) session.add( new_wf ) session.commit() time.sleep(0.5) else: #print "already have",wf pass if cannot_inject: #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject))) sendLog('injector','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)), level='warning') ## passing a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) no_replacement = set() ## pick up replacements for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all(): print wf.name if specific and not specific in wf.name: continue print wf.name wfi = workflowInfo(url, wf.name ) wl = wfi.request #getWorkLoad(url, wf.name) familly = getWorkflowById( url, wl['PrepID'] ) true_familly = [] for member in familly: if member == wf.name: continue fwl = getWorkLoad(url , member) if options.replace: if member != options.replace: continue else: if fwl['RequestDate'] < wl['RequestDate']: continue if fwl['RequestType']=='Resubmission': continue if fwl['RequestStatus'] in ['None',None,'new']: continue if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue true_familly.append( fwl ) if len(true_familly)==0: #sendLog('injector','%s had no replacement'%wf.name, level='critical') wfi.sendLog('injector','the workflow was found in trouble with no replacement') no_replacement.add( wf.name ) continue else: wfi.sendLog('injector','the workflow was found in trouble and has a replacement') print wf.name,"has",len(familly),"familly members" print wf.name,"has",len(true_familly),"true familly members" ##we cannot have more than one of them !!! pick the last one if len(true_familly)>1: #sendEmail('multiple wf','please take a look at injector for %s'%wf.name) sendLog('injector','Multiple wf in line, will take the last one for %s \n%s'%( wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical') for fwl in true_familly[-1:]: member = fwl['RequestName'] new_wf = session.query(Workflow).filter(Workflow.name == member).first() if not new_wf: sendLog('injector',"putting %s as replacement of %s"%( member, wf.name)) status = 'away' if fwl['RequestStatus'] in ['assignment-approved']: status = 'considered' new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus']) wf.status = 'forget' session.add( new_wf ) else: if new_wf.status == 'forget': continue sendLog('injector',"getting %s as replacement of %s"%( new_wf.name, wf.name )) wf.status = 'forget' for tr in session.query(Transfer).all(): if wf.id in tr.workflows_id: sw = copy.deepcopy(tr.workflows_id) sw.remove( wf.id) sw.append(new_wf.id) tr.workflows_id = sw print tr.phedexid,"got",new_wf.name if new_wf.status != 'away': print "\t setting it considered" new_wf.status = 'considered' if tr.phedexid<0: ## set it back to positive tr.phedexid = -tr.phedexid session.commit() ## don't do that automatically #wf.status = 'forget' session.commit() if no_replacement: #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement))) sendLog('injector','workflow with no replacement, %s \n are dangling there'% ( '\n'.join(no_replacement)), level='critical')