def changeSplittingWorkflow(url, workflow, split, task, split_type='EventAwareLumi'):
    """Set the job splitting of one task of a workflow through ``reqMgrClient``.

    Args:
        url: host passed unchanged to ``reqMgrClient.setWorkflowSplitting``.
        workflow: request name; stored as ``requestName`` and used as the
            first component of ``splittingTask``.
        split: splitting value handed to the parameter builder selected by
            ``split_type``.
        task: task name; ``splittingTask`` becomes ``/<workflow>/<task>``.
        split_type: one of ``'EventAwareLumi'``, ``'Event'``, ``'Lumi'`` or
            ``'Merge'``.

    Raises:
        ValueError: if ``split_type`` is not one of the supported values.
            (Previously this case crashed with an UnboundLocalError on
            ``params``.)
    """
    if split_type == 'EventAwareLumi':
        params = getEventAwareLumiParams(split)
    elif split_type == 'Event':
        params = getEventBasedParams(split)
    elif split_type == 'Lumi':
        params = getLumiBasedParams(split)
    elif split_type == 'Merge':
        params = getMergeParams(split)
    else:
        raise ValueError(
            "unsupported split_type %r; expected one of "
            "'EventAwareLumi', 'Event', 'Lumi', 'Merge'" % (split_type,))

    params['requestName'] = workflow
    params['splittingTask'] = '/%s/%s' % (workflow, task)
    data = reqMgrClient.setWorkflowSplitting(url, params)
    # TODO validate data
    # Single-argument print(): same output under Python 2 and Python 3.
    print(data)
def changeSplittingWorkflow(url, workflow, split, task, split_type='EventAwareLumi'):
    """Set the job splitting of one task of a workflow through ``reqMgrClient``.

    Args:
        url: host passed unchanged to ``reqMgrClient.setWorkflowSplitting``.
        workflow: request name; stored as ``requestName`` and used as the
            first component of ``splittingTask``.
        split: splitting value handed to the parameter builder selected by
            ``split_type``.
        task: task name; ``splittingTask`` becomes ``/<workflow>/<task>``.
        split_type: one of ``'EventAwareLumi'``, ``'Event'``, ``'Lumi'`` or
            ``'Merge'``.

    Raises:
        ValueError: if ``split_type`` is not one of the supported values.
            (Previously this case crashed with an UnboundLocalError on
            ``params``.)
    """
    if split_type == 'EventAwareLumi':
        params = getEventAwareLumiParams(split)
    elif split_type == 'Event':
        params = getEventBasedParams(split)
    elif split_type == 'Lumi':
        params = getLumiBasedParams(split)
    elif split_type == 'Merge':
        params = getMergeParams(split)
    else:
        raise ValueError(
            "unsupported split_type %r; expected one of "
            "'EventAwareLumi', 'Event', 'Lumi', 'Merge'" % (split_type,))

    params['requestName'] = workflow
    params['splittingTask'] = '/%s/%s' % (workflow, task)
    data = reqMgrClient.setWorkflowSplitting(url, params)
    # TODO validate data
    # Single-argument print(): same output under Python 2 and Python 3.
    print(data)
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, a ``Resubmission`` request for one task.

    Args:
        url: host passed unchanged to every ``reqMgrClient`` call.
        task: value stored in the payload as ``InitialTaskPath``.
        initial: request dictionary of the original workflow. Must contain
            ``RequestName`` and every key listed in ``copy_over``.
        actions: optional iterable of action strings.
            ``mem`` / ``mem-N`` adds N (default 1000) to ``Memory`` before
            submission; ``split`` / ``split-N`` divides the splitting of the
            submitted request by N (default 2).
        do: when false, only print the payload as JSON and return ``None``.

    Returns:
        The name of the submitted request, or ``None`` on a dry run, when the
        original ``RequestString`` already starts with ``ACDC``, or when the
        submission fails twice.
    """
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": "https://cmsweb.cern.ch/couchdb",
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
    }
    copy_over = ['PrepID', 'RequestPriority', 'TimePerEvent', 'SizePerEvent',
                 'Group', 'Memory', 'RequestString']
    for c in copy_over:
        # deepcopy so that later edits of the payload never touch `initial`
        payload[c] = copy.deepcopy(initial[c])

    if actions:
        for action in actions:
            if action.startswith('mem'):
                ## increase the memory requirement, by 1G unless "mem-N" says otherwise
                increase = int(action.split('-')[-1]) if '-' in action else 1000
                payload['Memory'] += increase

    if payload['RequestString'].startswith('ACDC'):
        print("This is not allowed yet")
        return None
    payload['RequestString'] = 'ACDC_' + payload['RequestString']
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    ## submit, with a single retry; the new request name is parsed out of the response
    m = None
    for failure in ("Error in making ACDC for", "Error twice in making ACDC for"):
        response = reqMgrClient.submitWorkflow(url, payload)
        # raw string: same pattern as before without the invalid "\/" escape
        m = re.search(r"details/(.*)'", response)
        if m:
            break
        print("%s %s" % (failure, initial["RequestName"]))
        print(response)
    if not m:
        return None
    acdc = m.group(1)

    ## perform modifications
    if actions:
        for action in actions:
            if action.startswith('split'):
                factor = int(action.split('-')[-1]) if '-' in action else 2
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                for split in splittings:
                    # only the first splitting parameter found is rescaled
                    for act in ['avg_events_per_job', 'events_per_job', 'lumis_per_job']:
                        if act in split:
                            before = split[act]
                            split[act] /= factor
                            print("Changing %s (%d) by a factor %d to %s" %
                                  (act, before, factor, split[act]))
                            break
                    split['requestName'] = acdc
                    print("changing the splitting of %s" % acdc)
                    print(json.dumps(split, indent=2))
                    print(reqMgrClient.setWorkflowSplitting(url, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
def assignor(url, specific=None, talk=True, options=None):
    """Assign the workflows found in the local database.

    For every candidate workflow the function derives a site white list,
    the output location, the processing version and the campaign / command
    line overrides, optionally changes the splitting, and finally calls
    ``reqMgrClient.assignWorkflow``. Successfully assigned workflows are
    moved to status ``away`` and their inputs and outputs are locked.

    Args:
        url: host passed to ``workflowInfo`` and ``reqMgrClient``.
        specific: optional comma separated list of substrings; only
            workflows whose name contains one of them are considered.
        talk: unused in this function.
        options: parsed command line options. Although the default is
            ``None`` the attributes ``manual``, ``early``, ``go``, ``test``,
            ``limit``, ``priority``, ``from_status``, ``primary_aaa``,
            ``secondary_aaa``, ``SiteWhitelist``, ``ProcessingVersion``,
            ``team`` and ``force_options`` are read, so an options object
            has to be provided.

    Returns:
        None. The outcome is reported through ``sendLog`` and prints.
    """
    if userLock() and not options.manual:
        return
    mlock = moduleLock()
    if mlock() and not options.manual:
        return
    if not componentInfo().check() and not options.manual:
        return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    # The former "SI = siteInfo()" was overwritten on the very next line.
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print("Option Early is on")

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print("Overriding to read from %s" % (fetch_from,))

    for status in fetch_from:
        print("getting wf in %s" % (status,))
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print(len(wfos))

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(
        eosRead('%s/equalizor.json' % monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    # Temporarily switch off prioritization: the ordering by RequestPriority
    # that used to be applied with options.early is disabled and the
    # workflows are always taken in random order.
    random.shuffle(wfos)

    def pick_options(options, parameters):
        ## parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v is not None:
                    if isinstance(v, str) and ',' in v:
                        # comma separated value -> list without empty items
                        parameters[key] = [item for item in v.split(',') if item]
                    else:
                        parameters[key] = v

    def pick_campaign(assign_parameters, parameters):
        ## pick up campaign specific assignment parameters
        parameters.update(assign_parameters.get('parameters', {}))

    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break
        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(sp in wfo.name for sp in specific.split(',')):
                continue

        if not options.manual and 'rucio' in (wfo.name).lower():
            continue
        print("\n\n")
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early:
            options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print("%s %s" % (wfo.name, wfh.request['RequestStatus']))

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should be all closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        # The previous condition was wrapped in a list literal, which is
        # always truthy, so every site passed the filter and the fallback
        # below could only be reached with an empty white list.
        t1t2_only = [
            ce for ce in sites_allowed if ce.startswith(('T1', 'T2'))
        ]
        if t1t2_only:
            # try to pick from T1T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print("available= %s" % (SI.disk[sites_out[0]],))
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading primary through xrootd at %s" %
                sorted(sites_allowed))

        if secondary_aaa:
            # Do not set TrustPUSitelist to True if there is no secondary
            if secondary:
                parameters['TrustPUSitelists'] = True
                wfh.sendLog(
                    'assignor', "Reading secondary through xrootd at %s" %
                    sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'):
            team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        # Whatever is applied last wins.
        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            # NOTE(review): the splitting computation is kept under the
            # PU_RD2 test, so the default of 500 jobs is never used; please
            # confirm this nesting is the intended one.
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    # FIXME: decide whether 'events_per_job' should be used as
                    # well; its lookup used to be overwritten right away by
                    # this one, so only 'avg_events_per_job' ever counted.
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                # Locking is best effort: a failure is reported but does not
                # undo the assignment.
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')
                except Exception as e:
                    print("fail in locking output")
                    print(str(e))
                    sendEmail("failed locking of output", str(e))
            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print("ERROR could not assign %s" % (wfo.name,))

    print("Assignment summary:")
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, a ``Resubmission`` request for one task.

    Args:
        url: host passed unchanged to ``reqMgrClient`` and ``workflowInfo``.
        task: value stored in the payload as ``InitialTaskPath``.
        initial: request dictionary of the original workflow. Must contain
            ``RequestName``, ``ConfigCacheUrl``, ``RequestPriority`` and
            ``RequestString``; the other ``copy_over`` keys are optional.
        actions: optional iterable of action strings:

            * ``mem-[tasks:]N`` sets the memory to N, ``mem-[tasks:]+N``
              raises it by N; ``tasks`` is a comma separated list of task
              names and only applies to task chains.
            * ``core-[tasks:]N`` sets the number of cores to N.
            * ``split[-N]`` divides the splitting of the submitted request
              by N (default 2); refused for ``MonteCarlo`` requests and for
              task chains whose ``Task1`` has no ``InputDataset``.
        do: when false, only print the payload as JSON and return ``None``.

    Returns:
        Whatever ``reqMgrClient.submitWorkflow`` returned for the new
        request, or ``None`` on a dry run, on a refused split, or when the
        submission fails twice.
    """
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['ConfigCacheUrl'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0
    }
    copy_over = [
        'PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
        'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion'
    ]
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print("%s not in the initial payload" % (c,))

    # boost the recovery over the initial workflow, capped at 500k
    payload['RequestPriority'] *= 2
    payload['RequestPriority'] = min(500000, payload['RequestPriority'])

    if actions:
        for action in actions:
            if action.startswith('mem'):
                arg = action.split('-', 1)[-1]
                tasks, spec = arg.split(':') if ':' in arg else (None, arg)
                tasks = tasks.split(',') if tasks else []
                # Exactly one of increase / set_to is an int afterwards.
                # Before, "+N" left set_to as the string "+N", which was
                # then written into the payload instead of adding N.
                if spec.startswith('+'):
                    increase, set_to = int(spec[1:]), None
                else:
                    increase, set_to = None, int(spec)
                if 'TaskChain' in initial:
                    mem_dict = {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t not in initial:
                            break
                        tname = payload.setdefault(t, initial[t])['TaskName']
                        mem_dict.setdefault(tname, payload[t]['Memory'])
                        if tasks and tname not in tasks:
                            print("%s not concerned" % (tname,))
                            continue
                        if set_to is not None:
                            mem_dict[tname] = set_to
                        else:
                            mem_dict[tname] += increase
                    payload['Memory'] = mem_dict
                elif set_to is not None:
                    payload['Memory'] = set_to
                else:
                    payload['Memory'] += increase

            if action.startswith('split') and (
                    initial['RequestType'] in ['MonteCarlo'] or
                (initial['RequestType'] in ['TaskChain']
                 and 'InputDataset' not in initial['Task1'])):
                print("I should not be doing splitting for this type of request %s"
                      % (initial['RequestName'],))
                return None

            if action.startswith('core'):
                arg = action.split('-', 1)[-1]
                tasks, set_to = arg.split(':') if ':' in arg else (None, arg)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                if 'TaskChain' in initial:
                    core_dict = {}
                    # was never initialised: the first use raised NameError
                    time_dict = {}
                    # NOTE(review): the memory scaling below only reaches the
                    # payload when an earlier "mem" action already turned
                    # payload['Memory'] into a dictionary.
                    mem_dict = payload['Memory'] if type(
                        payload['Memory']) == dict else {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t not in initial:
                            break
                        tname = payload.setdefault(t, initial[t])['TaskName']
                        mcore = core_dict.setdefault(tname,
                                                     payload[t]['Multicore'])
                        mem = mem_dict.setdefault(tname, payload[t]['Memory'])
                        if tasks and tname not in tasks:
                            print("%s not concerned" % (tname,))
                            continue
                        factor = (set_to / float(mcore))
                        # 40% of the memory is treated as independent of the
                        # number of cores, the rest scales with it
                        fraction_constant = 0.4
                        mem_per_core_c = int(
                            (1 - fraction_constant) * mem / float(mcore))
                        ## scale the memory
                        mem_dict[tname] += (set_to - mcore) * mem_per_core_c
                        ## scale time/event
                        time_dict[tname] = payload[t]['TimePerEvent'] / factor
                        ## set the number of cores
                        core_dict[tname] = set_to
                    payload['Multicore'] = core_dict
                    ##payload['TimePerEvent'] = time_dict ## cannot be used yet
                else:
                    # was "increase", a name from the "mem" branch that is
                    # undefined here unless a mem action happened to run first
                    payload['Multicore'] = set_to

    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        # slice instead of index: a bare "ACDC" no longer raises IndexError
        if initial_string[4:5].isdigit():
            acdc_round = int(initial_string[4])
        acdc_round += 1
    initial_string = initial_string.replace('ACDC_', '').replace(
        'ACDC%d_' % (acdc_round - 1), '')
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(json.dumps(payload, indent=2))
    print(actions)

    ## submit, with a single retry
    acdc = None
    for failure in ("Error in making ACDC for", "Error twice in making ACDC for"):
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if acdc:
            break
        print("%s %s" % (failure, initial["RequestName"]))
    if not acdc:
        return None

    ## perform modifications
    if actions:
        for action in actions:
            if action.startswith('split'):
                factor = int(action.split('-')[-1]) if '-' in action else 2
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                for split in splittings:
                    # only the first splitting parameter found is rescaled
                    for act in [
                            'avg_events_per_job', 'events_per_job',
                            'lumis_per_job'
                    ]:
                        if act in split:
                            before = split[act]
                            split[act] /= factor
                            print("Changing %s (%d) by a factor %d to %s" %
                                  (act, before, factor, split[act]))
                            break
                    split['requestName'] = acdc
                    print("changing the splitting of %s" % (acdc,))
                    print(json.dumps(split, indent=2))
                    print(reqMgrClient.setWorkflowSplitting(url, acdc, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, a ``Resubmission`` request for one task.

    Args:
        url: host passed unchanged to ``reqMgrClient`` and ``workflowInfo``.
        task: value stored in the payload as ``InitialTaskPath``.
        initial: request dictionary of the original workflow. Must contain
            ``RequestName``, ``CouchURL``, ``RequestPriority`` and
            ``RequestString``; the other ``copy_over`` keys are optional.
        actions: optional mapping of action name to value.
            A key starting with ``mem`` sets the memory to its value unless
            the value is ``""`` or ``'Same'``. A key starting with ``split``
            rescales the splitting of the submitted request: ``'max'`` sets
            it to 1, ``'Same'`` leaves it alone, ``'<N>x'`` divides by N and
            anything else divides by 2.
        do: when false, only print the payload as JSON and return ``None``.

    Returns:
        Whatever ``reqMgrClient.submitWorkflow`` returned for the new
        request, or ``None`` on a dry run, on a refused split, or when the
        submission fails twice.
    """
    print("Inside single recovery!")
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['CouchURL'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0
    }
    copy_over = ['PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
                 'SizePerEvent', 'Group', 'Memory', 'RequestString',
                 'CMSSWVersion']
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print("%s not in the initial payload" % (c,))

    # boost the recovery over the initial workflow, never above 500k
    payload['RequestPriority'] = min(500000, payload['RequestPriority'] * 2)

    # change parameters based on actions here
    if actions:
        for action in actions:
            if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same':
                payload['Memory'] = actions[action]
                # %-formatting: string concatenation failed for numeric values
                print("Memory set to %s" % (actions[action],))
                ## Taskchains needs to be treated special to set the memory to all tasks
                if 'TaskChain' in initial:
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t not in initial:
                            break
                        payload[t] = copy.deepcopy(initial[t])
                        payload[t]['Memory'] = actions[action]

            if action.startswith('split'):
                split_alert = (initial['RequestType'] in ['MonteCarlo'])
                for key in initial:
                    if key == 'SplittingAlgo' and (initial[key] in ['EventBased']):
                        split_alert = True
                    elif key.startswith('Task') and key != 'TaskChain':
                        for key2 in initial[key]:
                            if key2 == 'TaskName':
                                print("task %s" % (task.split('/')[-1],))
                                print("TaskName %s" % (initial[key][key2],))
                                # NOTE(review): this compares the task name
                                # with both the full task path and
                                # 'EventBased', so it can hardly ever be
                                # true; left untouched, please confirm the
                                # intended test.
                                if (initial[key][key2] == task) and (initial[key][key2] in ['EventBased']):
                                    split_alert = True
                if split_alert:
                    sendLog('actor', 'Cannot change splitting for %s' % initial['RequestName'], level='warning')
                    print("I should not be doing splitting for this type of request %s"
                          % (initial['RequestName'],))
                    return None

    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        # slice instead of index: a bare "ACDC" no longer raises IndexError
        if initial_string[4:5].isdigit():
            acdc_round = int(initial_string[4])
        acdc_round += 1

    initial_string = initial_string.replace('ACDC_', '').replace('ACDC%d_' % (acdc_round - 1), '')
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(actions)

    ## submit here, with a single retry
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % (initial["RequestName"],))
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % (initial["RequestName"],))
            sendLog('actor', 'Failed twice in making ACDCs for %s!' % initial['RequestName'], level='critical')
            return None

    ## change splitting if requested
    if actions:
        for action in actions:
            if action.startswith('split'):
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                if actions[action] != 'Same' and actions[action] != 'max':
                    factor = int(actions[action][0:-1]) if 'x' in actions[action] else 2
                    for split in splittings:
                        # only the first splitting parameter found is rescaled
                        for act in ['avg_events_per_job', 'events_per_job', 'lumis_per_job']:
                            if act in split:
                                before = split[act]
                                split[act] /= factor
                                print("Changing %s (%d) by a factor %d to %s" %
                                      (act, before, factor, split[act]))
                                break
                        split['requestName'] = acdc
                        print("changing the splitting of %s" % (acdc,))
                        print(json.dumps(split, indent=2))
                        print(reqMgrClient.setWorkflowSplitting(url, acdc, split))
                elif 'max' in actions[action]:
                    for split in splittings:
                        for act in ['avg_events_per_job', 'events_per_job', 'lumis_per_job']:
                            if act in split:
                                before = split[act]
                                split[act] = 1
                                print("Changing %s (%d)  to max splitting  %s" %
                                      (act, before, split[act]))
                                break
                        split['requestName'] = acdc
                        print("changing the splitting of %s" % (acdc,))
                        print(json.dumps(split, indent=2))
                        print(reqMgrClient.setWorkflowSplitting(url, acdc, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
def singleClone(url, wfname, actions, comment, do=False):
    """Submit a clone of a workflow, optionally changing memory and splitting.

    Args:
        url: host passed unchanged to ``workflowInfo`` and ``reqMgrClient``.
        wfname: name of the workflow to clone.
        actions: optional mapping of action name to value.
            A key starting with ``mem`` sets the memory (of every ``TaskN``
            for a task chain) unless the value is ``""`` or ``'Same'``.
            A key starting with ``split`` rescales the splitting of the
            clone: ``'max'`` sets it to 1, ``'Same'`` and ``''`` leave it
            alone, ``'<N>x'`` divides by N and anything else divides by 2.
        comment: unused at present (the notifications that used it are
            commented out).
        do: unused; the clone is always submitted.

    Returns:
        Whatever ``reqMgrClient.submitWorkflow`` returned for the clone, or
        ``None`` when the submission fails twice.
    """
    wfi = workflowInfo(url, wfname)
    payload = wfi.getSchema()
    initial = wfi.request

    payload['Requestor'] = os.getenv('USER')
    payload['Group'] = 'DATAOPS'
    payload['OriginalRequestName'] = initial['RequestName']
    payload['RequestPriority'] = initial['RequestPriority']

    if 'ProcessingVersion' in initial:
        payload['ProcessingVersion'] = int(initial['ProcessingVersion']) + 1
    else:
        payload['ProcessingVersion'] = 2

    ## drop parameters on the way to reqmgr2
    # 'RequestDate' and 'RequestorDN' used to lack the comma between them and
    # were silently concatenated into one name, so neither was dropped.
    paramBlacklist = [
        'BlockCloseMaxEvents', 'BlockCloseMaxFiles', 'BlockCloseMaxSize',
        'BlockCloseMaxWaitTime', 'CouchWorkloadDBName', 'CustodialGroup',
        'CustodialSubType', 'Dashboard', 'GracePeriod', 'HardTimeout',
        'InitialPriority', 'inputMode', 'MaxMergeEvents', 'MaxMergeSize',
        'MaxRSS', 'MaxVSize', 'MinMergeSize', 'NonCustodialGroup',
        'NonCustodialSubType', 'OutputDatasets', 'ReqMgr2Only',
        'RequestDate', 'RequestorDN', 'RequestName', 'RequestStatus',
        'RequestTransition', 'RequestWorkflow', 'SiteWhitelist',
        'SoftTimeout', 'SoftwareVersions', 'SubscriptionPriority', 'Team',
        'timeStamp', 'TrustSitelists', 'TrustPUSitelists',
        'TotalEstimatedJobs', 'TotalInputEvents', 'TotalInputLumis',
        'TotalInputFiles', 'checkbox', 'DN', 'AutoApproveSubscriptionSites',
        'NonCustodialSites', 'CustodialSites', 'OriginalRequestName',
        'Teams', 'OutputModulesLFNBases', 'SiteBlacklist',
        'AllowOpportunistic', '_id',
    ]
    for p in paramBlacklist:
        payload.pop(p, None)

    if actions:
        for action in actions:
            if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same':
                if 'TaskChain' in payload:
                    print("Setting memory for clone of task chain")
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t not in payload:
                            break
                        payload[t]['Memory'] = actions[action]
                        # report the task just modified; the counter has
                        # already moved on to the next one
                        print("Memory set for %s" % (t,))
                else:
                    print("Setting memory for non-taskchain workflow")
                    payload['Memory'] = actions[action]
                    # %-formatting: string concatenation failed for numeric values
                    print("Memory set to %s" % (actions[action],))

    print("Clone payload")
    print(actions)

    # Create clone, with a single retry
    clone = reqMgrClient.submitWorkflow(url, payload)
    if not clone:
        print("Error in making clone for %s" % (initial["RequestName"],))
        clone = reqMgrClient.submitWorkflow(url, payload)
        if not clone:
            print("Error twice in making clone for %s" % (initial["RequestName"],))
            sendLog('actor', 'Failed to make a clone twice for %s!' % initial["RequestName"], level='critical')
            wfi.sendLog('actor', 'Failed to make a clone twice for %s!' % initial["RequestName"])
            return None

    if actions:
        for action in actions:
            if action.startswith('split'):
                cloneinfo = workflowInfo(url, clone)
                splittings = cloneinfo.getSplittings()
                if actions[action] != 'Same' and actions[action] != 'max' and actions[action] != '':
                    factor = int(actions[action][0:-1]) if 'x' in actions[action] else 2
                    for split in splittings:
                        # only the first splitting parameter found is rescaled
                        for act in ['avg_events_per_job', 'events_per_job', 'lumis_per_job']:
                            if act in split:
                                wfi.sendLog('actor', 'Changing %s (%d) by a factor %d' % (act, split[act], factor))
                                before = split[act]
                                split[act] /= factor
                                print("Changing %s (%d) by a factor %d to %s" %
                                      (act, before, factor, split[act]))
                                break
                        split['requestName'] = clone
                        print("changing the splitting of %s" % (clone,))
                        print(json.dumps(split, indent=2))
                        print(reqMgrClient.setWorkflowSplitting(url, clone, split))
                elif 'max' in actions[action]:
                    for split in splittings:
                        for act in ['avg_events_per_job', 'events_per_job', 'lumis_per_job']:
                            if act in split:
                                # closing parenthesis was missing from the message
                                wfi.sendLog('actor', 'Max splitting set for %s (%d)' % (act, split[act]))
                                before = split[act]
                                split[act] = 1
                                print("Changing %s (%d)  to max splitting  %s" %
                                      (act, before, split[act]))
                                break
                        split['requestName'] = clone
                        print("changing the splitting of %s" % (clone,))
                        print(json.dumps(split, indent=2))
                        print(reqMgrClient.setWorkflowSplitting(url, clone, split))

    # Approve
    data = reqMgrClient.setWorkflowApproved(url, clone)
    wfi.sendLog('actor', 'Cloned into %s' % clone)
    # wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment ))
    # wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False)
    print(data)
    return clone
def assignor(url, specific=None, talk=True, options=None):
    """Select workflows from the local database and assign them through reqMgrClient.

    For every candidate workflow the function derives the site white list,
    checks campaign restrictions, secondary/primary input placement and
    availability, decides on splitting changes, builds the assignment
    parameters and calls ``reqMgrClient.assignWorkflow``. Counters of assigned
    and stalled workflows are reported through ``sendLog`` at the end.

    url      -- passed to workflowInfo, getWorkflows, getDatasetPresence,
                getDatasetBlocksFraction and the reqMgrClient calls.
    specific -- optional comma-separated list of substrings; when given, only
                workflows whose name contains one of them are handled.
    talk     -- when true, the presence of each primary dataset is printed.
    options  -- object carrying the command line switches. It is dereferenced
                unconditionally (go, early, from_status, limit, priority,
                partial, good_enough, primary_aaa, secondary_aaa,
                ProcessingVersion, team, force_options, test, ...), so despite
                the None default a real object has to be supplied.

    Returns None.
    """
    if userLock():
        return
    if duplicateLock():
        return
    if not componentInfo().check():
        return

    def load_json(path):
        ## read and decode a json file, making sure the handle gets closed
        with open(path) as handle:
            return json.load(handle)

    def pick_options(options, parameters):
        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v is not None:
                    if isinstance(v, str) and ',' in v:
                        ## comma separated values become a list, empty items dropped
                        parameters[key] = [item for item in v.split(',') if item]
                    else:
                        parameters[key] = v

    def pick_campaign(assign_parameters, parameters):
        ## pick up campaign specific assignment parameters
        parameters.update(assign_parameters.get('parameters', {}))

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go:
        return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print("Option Early is on")

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print("Overriding to read from %s" % (fetch_from,))

    for status in fetch_from:
        print("getting wf in %s" % (status,))
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print(len(wfos))

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = load_json('%s/dataset_endpoints.json' % monitor_dir)
    aaa_mapping = load_json('%s/equalizor.json' % monitor_pub_dir)['mapping']

    all_stuck = set()
    all_stuck.update(load_json('%s/stuck_transfers.json' % monitor_pub_dir))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print("10 first %s" % ([wfo.name for wfo in wfos[:10]],))
        print("10 last %s" % ([wfo.name for wfo in wfos[-10:]],))
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(sp in wfo.name for sp in specific.split(',')):
                continue
            #if not specific in wfo.name: continue
        print("\n\n")
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed',
                'aborted-archived', 'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early:
            options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            ## NOTE(review): kept under the partial switch; confirm the nesting
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                not_allowed = ', '.join(
                    set(secondary) - set(allowed_secondary.keys()))
                wfh.sendLog('assignor',
                            '%s is not an allowed secondary' % (not_allowed))
                sendLog('assignor',
                        '%s is not an allowed secondary' % (not_allowed),
                        level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print("%s %s" % (wfo.name, wfh.request['RequestStatus']))

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print("Could do partial disk copy assignment")
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print("We don't care where the secondary is")
                print("Cannot pass for now")
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print(sec)
            print(json.dumps(presence, indent=2))
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations is None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print("endpoints from stagor %s" % (dataset_endpoints[prim],))
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print(prim)
                print(json.dumps(presence, indent=2))
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([
                        SI.SE_to_CE(psite)
                        for (psite, (there, frac)) in presence.items()
                        if there
                    ]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations is None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            allowed_se = set([SI.CE_to_SE(site) for site in sites_allowed])
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list((set(secondary_locations)
                                      & set(primary_locations)) - allowed_se)
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list(set(primary_locations) - allowed_se)
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" %
                    ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            # take one out for the efficiency
            copies_wanted = max(1, copies_wanted - less_copies_than_requested)
        ## else: find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor',
                    "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                ## NOTE(review): reset only for StoreResults, which skips the
                ## availability check below; confirm the nesting
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print(
                        "Flip the status of downtime, since our destinations are good"
                    )
                    down_time = False
                print("with added endpoints %s" % (sorted(end_sites),))
            else:
                print(
                    "Cannot do partial assignment without knowin the endpoints"
                )
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print(json.dumps(available_fractions))
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = load_json('cannot_assign.json')
                except (IOError, OSError, ValueError):
                    ## best effort: a missing or unreadable bookkeeping file
                    ## simply means nothing is known yet
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    with open('cannot_assign.json', 'w') as handle:
                        handle.write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print("tried but status is %s" % (wfo.status,))
                if do_partial and above_good:
                    print("Will move on with partial locations")
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading primary through xrootd at %s" %
                sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'):
            team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check is None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                ## NOTE(review): the splitting adjustment is kept under the
                ## PU_RD2 test; confirm whether it was meant to apply to every
                ## PU_RD workflow (reqJobs = 500 is otherwise never used)
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    ## prefer the average estimate and fall back to the plain
                    ## one instead of discarding it when the average is absent
                    if 'avg_events_per_job' in spl:
                        eventsPerJobEstimated = spl['avg_events_per_job']
                    elif 'events_per_job' in spl:
                        eventsPerJobEstimated = spl['events_per_job']
                    else:
                        eventsPerJobEstimated = None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        ## team is not relevant anymore here
        result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print("fail in locking output")
                    print(str(e))
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print("ERROR could not assign %s" % (wfo.name,))

    print("Assignment summary:")
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
def singleClone(url, wfname, actions, comment, do=False):
    """Clone workflow `wfname`, optionally changing its memory and splitting.

    The schema of the workflow is fetched, re-labelled (requestor, group,
    incremented ProcessingVersion), purged through
    reqMgrClient.purgeClonedSchema and submitted, with one retry. When the
    clone exists, the requested splitting changes are applied to it and the
    clone is approved.

    url     -- passed to workflowInfo and to the reqMgrClient calls.
    wfname  -- name of the workflow to clone.
    actions -- mapping of action name to value (may be empty/None):
               'mem...'   -> memory to set, ignored when '' or 'Same';
               'split...' -> '<N>x' divides the per-job splitting parameter by
                             N (factor 2 when the value has no 'x'), 'max'
                             sets it to 1, '' and 'Same' leave it untouched.
    comment -- accepted for the callers' sake; not used in the body.
    do      -- accepted for the callers' sake; not used in the body.

    Returns the value returned by reqMgrClient.submitWorkflow for the clone,
    or None when the submission failed twice.
    """
    wfi = workflowInfo(url, wfname)
    payload = wfi.getSchema()
    initial = wfi.request
    payload['Requestor'] = os.getenv('USER')
    payload['Group'] = 'DATAOPS'
    payload['OriginalRequestName'] = initial['RequestName']
    payload['RequestPriority'] = initial['RequestPriority']

    if 'ProcessingVersion' in initial:
        payload['ProcessingVersion'] = int(initial['ProcessingVersion']) + 1
    else:
        payload['ProcessingVersion'] = 2

    payload = reqMgrClient.purgeClonedSchema(payload)

    splitting_keys = ['avg_events_per_job', 'events_per_job', 'lumis_per_job']

    if actions:
        for action in actions:
            value = actions[action]
            if action.startswith('mem') and value != "" and value != 'Same':
                if 'TaskChain' in payload:
                    print("Setting memory for clone of task chain")
                    mem_dict = {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t in payload:
                            tname = payload[t]['TaskName']
                            mem_dict[tname] = int(value)
                            ## report the task that was just handled, not the
                            ## already incremented counter
                            print("Memory set for %s" % (t,))
                        else:
                            break
                    payload['Memory'] = mem_dict
                else:
                    print("Setting memory for non-taskchain workflow")
                    payload['Memory'] = int(value)
                    print("Memory set to %s" % (value,))

    print("Clone payload")
    # print json.dumps( payload , indent=2)
    print(actions)

    #Create clone
    clone = reqMgrClient.submitWorkflow(url, payload)
    if not clone:
        print("Error in making clone for %s" % (initial["RequestName"],))
        clone = reqMgrClient.submitWorkflow(url, payload)
        if not clone:
            print("Error twice in making clone for %s" %
                  (initial["RequestName"],))
            sendLog('actor',
                    'Failed to make a clone twice for %s!' %
                    initial["RequestName"],
                    level='critical')
            wfi.sendLog(
                'actor', 'Failed to make a clone twice for %s!' %
                initial["RequestName"])
            return None

    if actions:
        for action in actions:
            if action.startswith('split'):
                value = actions[action]
                cloneinfo = workflowInfo(url, clone)
                splittings = cloneinfo.getSplittingsNew(strip=True)
                if value != 'Same' and value != 'max' and value != '':
                    factor = int(value[0:-1]) if 'x' in value else 2
                    for split in splittings:
                        split_par = split['splitParams']
                        ## only the first splitting parameter found is changed
                        for act in splitting_keys:
                            if act in split_par:
                                wfi.sendLog(
                                    'actor',
                                    'Changing %s (%d) by a factor %d' %
                                    (act, split_par[act], factor))
                                split_par[act] /= factor
                                print("to %s" % (split_par[act],))
                                break
                elif 'max' in value:
                    for split in splittings:
                        split_par = split['splitParams']
                        for act in splitting_keys:
                            if act in split_par:
                                wfi.sendLog(
                                    'actor', 'Max splitting set for %s (%d' %
                                    (act, split_par[act]))
                                previous = split_par[act]
                                split_par[act] = 1
                                ## single line, same text as the former pair of
                                ## print statements
                                print("Changing %s (%d)  to max splitting  %s"
                                      % (act, previous, split_par[act]))
                                break

                ## the (possibly untouched) splittings are always sent back
                print("changing the splitting of %s" % (clone,))
                print(json.dumps(splittings, indent=2))
                print(reqMgrClient.setWorkflowSplitting(url, clone, splittings))

    #Approve
    data = reqMgrClient.setWorkflowApproved(url, clone)
    #wfi.sendLog('actor','Cloned into %s'%clone)
    # wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment ))
    # wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False)
    print(data)
    return clone
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, a Resubmission (ACDC) request for one task.

    url     -- passed to the reqMgrClient calls and to workflowInfo.
    task    -- stored in the payload as 'InitialTaskPath'.
    initial -- request dictionary of the workflow to recover; must hold
               'ConfigCacheUrl', 'RequestName' and the copied-over keys that
               are used below ('RequestPriority', 'RequestString', ...).
    actions -- iterable of action strings (may be empty/None):
               'mem-[task1,task2:]N'  set the memory to N,
               'mem-[task1,task2:]+N' raise the memory by N,
               'core-[task1,task2:]N' set the number of cores to N,
               'split[-F]'            divide the splitting by F (default 2)
                                      once the request is submitted.
    do      -- when false the payload is only printed and None is returned.

    Returns the value returned by reqMgrClient.submitWorkflow, or None when
    nothing was submitted (dry run, splitting refused for this request type,
    or two failed submissions).
    """
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['ConfigCacheUrl'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0
    }
    copy_over = [
        'PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
        'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion'
    ]
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print("%s not in the initial payload" % (c,))

    #a massage ? boost the recovery over the initial wf
    payload['RequestPriority'] *= 2
    payload['RequestPriority'] = min(500000, payload['RequestPriority'])

    if actions:
        for action in actions:
            #if action.startswith('split'):
            #    factor = int(action.split('-')[-1]) if '-' in action else 2
            #    print "Changing time per event (%s) by a factor %d"%( payload['TimePerEvent'], factor)
            #    ## mention it's taking 2 times longer to have a 2 times finer splitting
            #    payload['TimePerEvent'] = factor*payload['TimePerEvent']
            if action.startswith('mem'):
                arg = action.split('-', 1)[-1]
                increase = None
                tasks, set_to = arg.split(':') if ':' in arg else (None, arg)
                tasks = tasks.split(',') if tasks else []
                if set_to.startswith('+'):
                    ## relative change: nothing to set, only to add
                    increase = int(set_to[1:])
                    set_to = None
                else:
                    set_to = int(set_to)

                if 'TaskChain' in initial:
                    mem_dict = {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t in initial:
                            tname = payload.setdefault(t,
                                                       initial[t])['TaskName']
                            ## start from the memory the task currently has
                            mem_dict.setdefault(tname, payload[t]['Memory'])
                            if tasks and not tname in tasks:
                                print("%s not concerned" % (tname,))
                                continue
                            if increase is not None:
                                mem_dict[tname] += increase
                            else:
                                mem_dict[tname] = set_to
                        else:
                            break
                    payload['Memory'] = mem_dict
                else:
                    if increase is not None:
                        payload['Memory'] += increase
                    else:
                        payload['Memory'] = set_to

            if action.startswith('split') and (
                    initial['RequestType'] in ['MonteCarlo'] or
                (initial['RequestType'] in ['TaskChain']
                 and not 'InputDataset' in initial['Task1'])):
                print(
                    "I should not be doing splitting for this type of request %s"
                    % (initial['RequestName'],))
                return None

            if action.startswith('core'):
                arg = action.split('-', 1)[-1]
                tasks, set_to = arg.split(':') if ':' in arg else (None, arg)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                if 'TaskChain' in initial:
                    core_dict = {}
                    time_dict = {}
                    ## NOTE(review): when payload['Memory'] is not a dict the
                    ## scaled memory below is computed but never stored in the
                    ## payload; confirm whether that is intended
                    mem_dict = payload['Memory'] if isinstance(
                        payload['Memory'], dict) else {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t in initial:
                            tname = payload.setdefault(t,
                                                       initial[t])['TaskName']
                            mcore = core_dict.setdefault(
                                tname, payload[t]['Multicore'])
                            mem = mem_dict.setdefault(tname,
                                                      payload[t]['Memory'])
                            if tasks and not tname in tasks:
                                print("%s not concerned" % (tname,))
                                continue
                            factor = (set_to / float(mcore))
                            fraction_constant = 0.4
                            mem_per_core_c = int(
                                (1 - fraction_constant) * mem / float(mcore))
                            ##scale the memory
                            mem_dict[tname] += (set_to -
                                                mcore) * mem_per_core_c
                            ## scale time/event (kept for the day it can be used)
                            if 'TimePerEvent' in payload[t]:
                                time_dict[tname] = payload[t][
                                    'TimePerEvent'] / factor
                            ## set the number of cores
                            core_dict[tname] = set_to
                        else:
                            break
                    payload['Multicore'] = core_dict
                    ##payload['TimePerEvent'] = time_dict ## cannot be used yet
                else:
                    payload['Multicore'] = set_to

    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        ## slice instead of index: a bare 'ACDC' string has no fifth character
        if initial_string[4:5].isdigit():
            acdc_round = int(initial_string[4])
        acdc_round += 1
        #print acdc_round
        #print "This is not allowed yet"
        #return None
    initial_string = initial_string.replace('ACDC_', '').replace(
        'ACDC%d_' % (acdc_round - 1), '')
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(json.dumps(payload, indent=2))
    print(actions)

    ## submit
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % (initial["RequestName"],))
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" %
                  (initial["RequestName"],))
            return None

    ## perform modifications
    if actions:
        for action in actions:
            if action.startswith('split'):
                factor = int(action.split('-')[-1]) if '-' in action else 2
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                for split in splittings:
                    ## only the first splitting parameter found is changed
                    for act in [
                            'avg_events_per_job', 'events_per_job',
                            'lumis_per_job'
                    ]:
                        if act in split:
                            previous = split[act]
                            split[act] /= factor
                            ## single line, same text as the former pair of
                            ## print statements
                            print("Changing %s (%d) by a factor %d to %s" %
                                  (act, previous, factor, split[act]))
                            break
                    split['requestName'] = acdc
                    print("changing the splitting of %s" % (acdc,))
                    print(json.dumps(split, indent=2))
                    print(reqMgrClient.setWorkflowSplitting(url, acdc, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
def singleClone(url, wfname, actions, comment, do=False): wfi = workflowInfo(url, wfname) payload = wfi.getSchema() initial = wfi.request payload['Requestor'] = os.getenv('USER') payload['Group'] = 'DATAOPS' payload['OriginalRequestName'] = initial['RequestName'] payload['RequestPriority'] = initial['RequestPriority'] if 'ProcessingVersion' in initial: payload['ProcessingVersion'] = int(initial['ProcessingVersion']) + 1 else: payload['ProcessingVersion'] = 2 ## drop parameters on the way to reqmgr2 paramBlacklist = [ 'BlockCloseMaxEvents', 'BlockCloseMaxFiles', 'BlockCloseMaxSize', 'BlockCloseMaxWaitTime', 'CouchWorkloadDBName', 'CustodialGroup', 'CustodialSubType', 'Dashboard', 'GracePeriod', 'HardTimeout', 'InitialPriority', 'inputMode', 'MaxMergeEvents', 'MaxMergeSize', 'MaxRSS', 'MaxVSize', 'MinMergeSize', 'NonCustodialGroup', 'NonCustodialSubType', 'OutputDatasets', 'ReqMgr2Only', 'RequestDate' 'RequestorDN', 'RequestName', 'RequestStatus', 'RequestTransition', 'RequestWorkflow', 'SiteWhitelist', 'SoftTimeout', 'SoftwareVersions', 'SubscriptionPriority', 'Team', 'timeStamp', 'TrustSitelists', 'TrustPUSitelists', 'TotalEstimatedJobs', 'TotalInputEvents', 'TotalInputLumis', 'TotalInputFiles', 'checkbox', 'DN', 'AutoApproveSubscriptionSites', 'NonCustodialSites', 'CustodialSites', 'OriginalRequestName', 'Teams', 'OutputModulesLFNBases', 'SiteBlacklist', 'AllowOpportunistic', '_id', 'Override' ] for p in paramBlacklist: if p in payload: payload.pop(p) taskParamBlacklist = ['EventsPerJob'] for i in range(1, 100): t = 'Task%s' % i if not t in payload: break for p in taskParamBlacklist: if p in payload[t]: payload[t].pop(p) if actions: for action in actions: if action.startswith('mem') and actions[action] != "" and actions[ action] != 'Same': if 'TaskChain' in payload: print "Setting memory for clone of task chain" mem_dict = {} it = 1 while True: t = 'Task%d' % it it += 1 if t in payload: tname = payload[t]['TaskName'] mem_dict[tname] = int(actions[action]) print 
"Memory set for Task%d" % it else: break payload['Memory'] = mem_dict else: print "Setting memory for non-taskchain workflow" payload['Memory'] = int(actions[action]) print "Memory set to " + actions[action] print "Clone payload" # print json.dumps( payload , indent=2) print actions #Create clone clone = reqMgrClient.submitWorkflow(url, payload) if not clone: print "Error in making clone for", initial["RequestName"] clone = reqMgrClient.submitWorkflow(url, payload) if not clone: print "Error twice in making clone for", initial["RequestName"] sendLog('actor', 'Failed to make a clone twice for %s!' % initial["RequestName"], level='critical') wfi.sendLog( 'actor', 'Failed to make a clone twice for %s!' % initial["RequestName"]) return None if actions: for action in actions: if action.startswith('split'): cloneinfo = workflowInfo(url, clone) splittings = cloneinfo.getSplittingsNew(strip=True) if actions[action] != 'Same' and actions[ action] != 'max' and actions[action] != '': factor = int( actions[action][0:-1]) if 'x' in actions[action] else 2 for split in splittings: split_par = split['splitParams'] for act in [ 'avg_events_per_job', 'events_per_job', 'lumis_per_job' ]: if act in split_par: wfi.sendLog( 'actor', 'Changing %s (%d) by a factor %d' % (act, split_par[act], factor)) split_par[act] /= factor print "to", split_par[act] break #split['requestName'] = clone #print "changing the splitting of",clone #print json.dumps( split, indent=2 ) #print reqMgrClient.setWorkflowSplitting(url, clone, split ) elif 'max' in actions[action]: for split in splittings: split_par = split['splitParams'] for act in [ 'avg_events_per_job', 'events_per_job', 'lumis_per_job' ]: if act in split_par: wfi.sendLog( 'actor', 'Max splitting set for %s (%d' % (act, split_par[act])) print "Changing %s (%d) " % (act, split_par[act]), split_par[act] = 1 print "to max splitting ", split_par[act] break #split['requestName'] = clone #print "changing the splitting of",clone #print json.dumps( split, 
indent=2 ) #print reqMgrClient.setWorkflowSplitting(url, clone, split ) print "changing the splitting of", clone print json.dumps(splittings, indent=2) print reqMgrClient.setWorkflowSplitting(url, clone, splittings) #Approve data = reqMgrClient.setWorkflowApproved(url, clone) wfi.sendLog('actor', 'Cloned into %s' % clone) # wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment )) # wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False) print data return clone
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos=[] fetch_from = [] if specific or options.early: fetch_from.extend(['considered','staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from",fetch_from for status in fetch_from: print "getting wf in",status wfos.extend(session.query(Workflow).filter(Workflow.status==status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_pub_dir).read() )) all_stuck.update( getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') ##order by priority instead of random if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank( wfn ): return cache.index( wfn ) if wfn in cache else 0 wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True) print "10 first",[wfo.name for wfo in wfos[:10]] print "10 last",[wfo.name for wfo in wfos[-10:]] else: random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and 
(n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) if wfh.request['RequestStatus'] in ['rejected','aborted','aborted-completed','aborted-archived','rejected-archived'] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled+=1 continue if options.priority and int(wfh.request['RequestPriority']) < options.priority: continue options_text="" if options.early: options_text+=", early option is ON" if options.partial: options_text+=", partial option is ON" options_text+=", good fraction is %.2f"%options.good_enough wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not output_tiers: n_stalled+=1 wfh.sendLog('assignor','There is no output at all') sendLog('assignor','Workflow %s has no output at all'%( wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update( CI.campaigns[campaign] ) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]: banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: 
no_go=True wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier))) sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys())))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]: assign_parameters.update( allowed_secondary[sec] ) if no_go: n_stalled+=1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = wfh.getBlockWhiteList() rwl = wfh.getRunWhiteList() if rwl: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=rwl ) )) 
lwl = wfh.getLumiWhiteList() if lwl: ## augment with lumi white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, lumis=lwl))) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns: assign_parameters.update( CI.campaigns[wfh.request['Campaign']] ) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] if secondary_aaa: if not one_secondary_locations: sec_availability = getDatasetBlocksFraction( url, sec ) if sec_availability >=1. and options.go: ## there is at least one copy of each block on disk. We should go ahead and let it go. 
wfh.sendLog('assignor',"The secondary %s is available %s times on disk, and usable"%( sec, sec_availability)) else: ## not even a copy on disk anywhere !!!! sites_allowed = [] ## will block the assignment wfh.sendLog('assignor',"The secondary %s is nowhere on disk"% sec) #just continue without checking continue #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"Intersecting with secondary requirement, now allowed %s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) ## if they are requested for processing, they should bbe all closed already closeAllBlocks(url, prim, blocks) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) if primary_aaa: available_fractions[prim] = getDatasetBlocksFraction(url, prim, 
only_blocks = blocks) sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] if primary_aaa: sites_all_data = set() for (psite,(there,frac)) in presence.items(): if there: sites_all_data.update( SI.SE_to_CEs(psite) ) sites_all_data = list(sites_all_data) #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there])) sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] if primary_aaa: sites_with_any_data = set() for psite in presence.keys(): sites_with_any_data.update( SI.SE_to_CEs(psite) ) sites_with_any_data = list(sites_with_any_data) #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()])) holding_but_not_allowed = set() for se_site in presence.keys(): if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)): holding_but_not_allowed.add( se_site ) #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted( holding_but_not_allowed )) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations 
and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% 
copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off") primary_aaa=False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update( aaa_mapping.get(site,[]) ) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed)) isStoreResults = ( 'StoreResults' == wfh.request.setdefault('RequestType',None) ) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled+= 1 wfh.sendLog('assignor',"Cannot assign StoreResults request because MergedLFN is missing") sendLog('assignor','Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if not isStoreResults: sites_allowed = sites_with_any_data else: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog('assignor',"Cannot assign StoreResults request because SiteWhitelist is missing") sendLog('assignor','Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue available_fractions = {} wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for 
osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints",sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled+=1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low ))) copies_wanted = max(1., copies_wanted-1.) if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) above_good = all([available >= do_partial for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. 
sending back to considered.'%( wfo.name ), level='delay') n_stalled+=1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good): wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled+=1 continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor',"cannot be assign with no matched sites") sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog('assignor',"Reading primary through xrootd at 
%s"%sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed)) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v def pick_campaign( assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update( assign_parameters.get('parameters',{}) ) if options.force_options: pick_campaign( assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign( assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog('assignor','Holding on to the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor','Change of splitting is on hold') n_stalled+=1 continue if split_check==None or split_check==False: n_stalled+=1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog('assignor','Applying the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() 
eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") if isHEPCloudReady(url) and wfh.isGoodForNERSC(): parameters['Team'] = 'hepcloud' parameters['SiteWhitelist'] = ['T3_US_NERSC'] if primary: parameters['TrustSitelists'] = True if secondary: parameters['TrustPUSitelists'] = True sendEmail("sending work to hepcloud","pleasse check on %s"% wfh.request['RequestName'], destination=['*****@*****.**']) ## make sure to autoapprove all NonCustodialSites 
parameters['AutoApproveSubscriptionSites'] = list(set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites',[]))) result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock( secure, reason = 'assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor',"%s workflows cannot be assigned. Please take a look"%(n_stalled), level='critical')
def singleRecovery(url, task, initial, actions, do=False): payload = { "Requestor": os.getenv('USER'), "Group": 'DATAOPS', "RequestType": "Resubmission", "ACDCServer": initial['CouchURL'], "ACDCDatabase": "acdcserver", "OriginalRequestName": initial['RequestName'] } copy_over = [ 'PrepID', 'RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion' ] for c in copy_over: payload[c] = copy.deepcopy(initial[c]) #a massage ? boost the recovery over the initial wf payload['RequestPriority'] *= 10 if actions: for action in actions: #if action.startswith('split'): # factor = int(action.split('-')[-1]) if '-' in action else 2 # print "Changing time per event (%s) by a factor %d"%( payload['TimePerEvent'], factor) # ## mention it's taking 2 times longer to have a 2 times finer splitting # payload['TimePerEvent'] = factor*payload['TimePerEvent'] if action.startswith('mem'): increase = int( action.split('-')[-1]) if '-' in action else 1000 ## increase the memory requirement by 1G payload['Memory'] += increase if action.startswith('split') and ( initial['RequestType'] in ['MonteCarlo'] or (initial['RequestType'] in ['TaskChain'] and not 'InputDataset' in initial['Task1'])): print "I should not be doing splitting for this type of request", initial[ 'RequestName'] return None if payload['RequestString'].startswith('ACDC'): print "This is not allowed yet" return None payload['RequestString'] = 'ACDC_' + payload['RequestString'] payload['InitialTaskPath'] = task if not do: print json.dumps(payload, indent=2) return None print json.dumps(payload, indent=2) ## submit acdc = reqMgrClient.submitWorkflow(url, payload) if not acdc: print "Error in making ACDC for", initial["RequestName"] acdc = reqMgrClient.submitWorkflow(url, payload) if not acdc: print "Error twice in making ACDC for", initial["RequestName"] return None ## perform modifications if actions: for action in actions: if action.startswith('split'): factor = 
int(action.split('-')[-1]) if '-' in action else 2 acdcInfo = workflowInfo(url, acdc) splittings = acdcInfo.getSplittings() for split in splittings: for act in [ 'avg_events_per_job', 'events_per_job', 'lumis_per_job' ]: if act in split: print "Changing %s (%d) by a factor %d" % ( act, split[act], factor), split[act] /= factor print "to", split[act] break split['requestName'] = acdc print "changing the splitting of", acdc print json.dumps(split, indent=2) print reqMgrClient.setWorkflowSplitting(url, acdc, split) data = reqMgrClient.setWorkflowApproved(url, acdc) print data return acdc
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, an ACDC resubmission for one task.

    Args:
        url: passed through to the reqMgrClient / workflowInfo calls.
        task: task path stored as ``InitialTaskPath``; its last ``/``
            component is compared with the ``TaskName`` of each task.
        initial: request dictionary of the original workflow. It must hold
            ``ConfigCacheUrl``, ``RequestName`` and ``RequestString``; keys
            of ``copy_over`` that are missing are reported and skipped.
            It is not modified.
        actions: mapping of action name to requested value, or a false
            value for none. Keys starting with ``mem`` set the memory,
            ``multicore`` sets the core count and rescales the memory
            (starting from ``actions['memory']`` when given), keys starting
            with ``split`` change the job splitting of the submitted ACDC
            (``'<N>x'`` divides by N, ``'max'`` sets the size to 1,
            ``'Same'`` leaves it alone, anything else divides by 2).
        do: when false, only print the payload and return None.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the ACDC,
        or None for a dry run, a refused splitting change or two failed
        submissions.
    """
    # Function-scope import: the top-of-file imports are not visible from
    # this block, so do not rely on `re` being imported there.
    import re

    def scaled_memory(mem, cores_before, cores_after):
        # 40% of the memory is treated as a constant overhead, the rest is
        # scaled with the number of cores.
        fraction_constant = 0.4
        mem_per_core_c = int((1 - fraction_constant) * mem / float(cores_before))
        return int(mem + (cores_after - cores_before) * mem_per_core_c)

    # NOTE: single-argument print(...) calls produce the same output under
    # the Python 2 print statement used by the rest of this file.
    print("Inside single recovery!")
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['ConfigCacheUrl'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0,
    }
    copy_over = [
        'PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
        'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion',
    ]
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print("%s not in the initial payload" % c)

    # Boost the recovery over the initial workflow. Max priority is 1M,
    # never go above 500k. Skipped when the priority was not copied over.
    if 'RequestPriority' in payload:
        payload['RequestPriority'] = min(500000, payload['RequestPriority'] * 2)

    requested = actions or {}

    ## change parameters based on actions here
    for action in requested:
        value = requested[action]

        if action.startswith('mem') and value != "" and value != 'Same':
            if 'multicore' in requested and requested['multicore'] != "":
                # the multicore branch sets the memory, scaled to the new
                # number of cores
                continue
            set_to = int(value)
            if 'TaskChain' in initial:
                ## taskchains need the memory to be set for all tasks
                mem_dict = {}
                it = 1
                while 'Task%d' % it in initial:
                    t = 'Task%d' % it
                    it += 1
                    if t not in payload:
                        # copy, so that `initial` is never aliased
                        payload[t] = copy.deepcopy(initial[t])
                    mem_dict[payload[t]['TaskName']] = set_to
                payload['Memory'] = mem_dict
                print("Memory set to:  %s" % json.dumps(mem_dict, indent=2))
            else:
                payload['Memory'] = set_to
                print("Memory set to:  %s" % set_to)

        if action.startswith('multicore') and value != "":
            set_to = int(value)
            # an explicit memory request replaces the current memory as the
            # starting point of the scaling; it arrives as text, like the
            # other action values, hence the int()
            forced_mem = None
            if 'memory' in requested and requested['memory'] not in ("", 'Same'):
                forced_mem = int(requested['memory'])
            if 'TaskChain' in initial:
                ## taskchains need multicore and memory set for all tasks
                if isinstance(payload.get('Memory'), dict):
                    mem_dict = payload['Memory']
                else:
                    mem_dict = {}
                core_dict = {}
                it = 1
                while 'Task%d' % it in initial:
                    t = 'Task%d' % it
                    it += 1
                    if t not in payload:
                        payload[t] = copy.deepcopy(initial[t])
                    tname = payload[t]['TaskName']
                    initial_cores = payload[t].setdefault('Multicore', 1)
                    if forced_mem is not None:
                        mem = forced_mem
                    elif tname in mem_dict:
                        mem = mem_dict[tname]
                    else:
                        mem = payload[t]['Memory']
                    mem_dict[tname] = scaled_memory(mem, initial_cores, set_to)
                    core_dict[tname] = set_to
                    print("For  %s" % t)
                    print("Multicore set to  %s" % set_to)
                    print("Memory set to  %s" % mem_dict[tname])
                payload['Memory'] = mem_dict
                payload['Multicore'] = core_dict
            else:
                initial_cores = initial.get('Multicore', 1)
                mem = forced_mem if forced_mem is not None else payload['Memory']
                payload['Multicore'] = set_to
                payload['Memory'] = scaled_memory(mem, initial_cores, set_to)
                print("Multicore set to  %s" % set_to)
                print("Memory set to  %s" % payload['Memory'])

        if action.startswith('split'):
            split_alert = initial['RequestType'] in ['MonteCarlo']
            if initial.get('SplittingAlgo') in ['EventBased']:
                split_alert = True
            recover_task = task.split('/')[-1]
            for key in initial:
                if not key.startswith('Task') or key == 'TaskChain':
                    continue
                task_conf = initial[key]
                if not isinstance(task_conf, dict) or 'TaskName' not in task_conf:
                    continue
                this_taskname = task_conf['TaskName']
                print("For recovery of task %s" % recover_task)
                print("Looking at task %s" % this_taskname)
                if (recover_task == this_taskname and
                        task_conf.get('SplittingAlgo') in ['EventBased']):
                    ## the task to be recovered is of the wrong type to allow
                    ## a change of splitting: report it, but do not raise the
                    ## alert and do not stop the acdc
                    sendLog(
                        'actor',
                        'To recover on %s, changing the splitting on %s is not really allowed and this will be ignored instead of failing acdc.'
                        % (task, task_conf['SplittingAlgo']),
                        level='critical')
            if split_alert:
                sendLog('actor',
                        'Cannot change splitting for %s' % initial['RequestName'],
                        level='critical')
                print("I should not be doing splitting for this type of request %s"
                      % initial['RequestName'])
                return None

    ## name the request after the recovery round: ACDC<round>_<original string>
    acdc_round = 0
    initial_string = payload['RequestString']
    previous = re.match(r'ACDC(\d*)_', initial_string)
    if previous:
        # read the whole round number, so round 10 and later keep counting
        acdc_round = int(previous.group(1) or 0) + 1
        initial_string = initial_string[previous.end():]
    elif initial_string.startswith('ACDC'):
        acdc_round = 1
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(actions)

    ## submit here, retrying once on failure
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % initial["RequestName"])
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % initial["RequestName"])
            sendLog('actor',
                    'Failed twice in making ACDCs for %s!' % initial['RequestName'],
                    level='critical')
            return None

    ## change splitting if requested
    job_size_keys = ['avg_events_per_job', 'events_per_job', 'lumis_per_job']
    for action in requested:
        if not action.startswith('split'):
            continue
        acdcInfo = workflowInfo(url, acdc)
        splittings = acdcInfo.getSplittingsNew(strip=True)
        choice = requested[action]
        if choice != 'Same' and choice != 'max':
            factor = int(choice[0:-1]) if 'x' in choice else 2
            for split in splittings:
                split_par = split['splitParams']
                if split['splitAlgo'] in ['EventBased']:
                    sendLog(
                        'actor',
                        "Changing the splitting on %s for %s is not permitted. Not changing."
                        % (split['splitAlgo'], initial["RequestName"]),
                        level='critical')
                    continue
                for act in job_size_keys:
                    if act in split_par:
                        before = split_par[act]
                        # floor division keeps the value integral; the
                        # clamp avoids requesting jobs of size zero
                        split_par[act] = max(1, before // factor)
                        print("Changing %s (%d) by a factor %d to %s"
                              % (act, before, factor, split_par[act]))
                        break
        elif choice == 'max':
            for split in splittings:
                split_par = split['splitParams']
                for act in job_size_keys:
                    if act in split_par:
                        before = split_par[act]
                        split_par[act] = 1
                        print("Changing %s (%d)  to max splitting  %s"
                              % (act, before, split_par[act]))
                        break
        print("changing the splitting of %s" % acdc)
        print(json.dumps(splittings, indent=2))
        # TODO: check the value returned by setWorkflowSplitting
        reqMgrClient.setWorkflowSplitting(url, acdc, splittings)

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, an ACDC resubmission for one task.

    Args:
        url: passed through to the reqMgrClient / workflowInfo calls.
        task: task path stored as ``InitialTaskPath`` in the payload.
        initial: request dictionary of the original workflow. It must hold
            ``CouchURL``, ``RequestName`` and every key of ``copy_over``;
            ``RequestType`` (and ``Task1`` for a TaskChain) is read when a
            split action is present.
        actions: iterable of action strings, or a false value for none.
            ``mem[-N]`` adds N (default 1000) to ``Memory``; ``split[-N]``
            divides the job size of the submitted ACDC by N (default 2).
        do: when false, only print the payload and return None.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the ACDC,
        or None for a dry run, a refused request or two failed submissions.
    """
    # NOTE: single-argument print(...) calls produce the same output under
    # the Python 2 print statement used by the rest of this file.
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['CouchURL'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
    }
    copy_over = [
        'PrepID', 'RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group',
        'Memory', 'RequestString', 'CMSSWVersion',
    ]
    for c in copy_over:
        # deep copy so that edits to the payload never leak into `initial`
        payload[c] = copy.deepcopy(initial[c])

    # boost the recovery over the initial workflow
    payload['RequestPriority'] *= 10

    requested = actions or []
    for action in requested:
        if action.startswith('mem'):
            # default: increase the memory requirement by 1G
            increase = int(action.split('-')[-1]) if '-' in action else 1000
            payload['Memory'] += increase
        if action.startswith('split') and (
                initial['RequestType'] in ['MonteCarlo'] or
                (initial['RequestType'] in ['TaskChain'] and
                 'InputDataset' not in initial['Task1'])):
            print("I should not be doing splitting for this type of request %s"
                  % initial['RequestName'])
            return None

    if payload['RequestString'].startswith('ACDC'):
        print("This is not allowed yet")
        return None
    payload['RequestString'] = 'ACDC_' + payload['RequestString']
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print(json.dumps(payload, indent=2))

    ## submit, retrying once on failure
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % initial["RequestName"])
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % initial["RequestName"])
            return None

    ## perform modifications on the submitted ACDC
    for action in requested:
        if not action.startswith('split'):
            continue
        factor = int(action.split('-')[-1]) if '-' in action else 2
        acdcInfo = workflowInfo(url, acdc)
        for split in acdcInfo.getSplittings():
            for act in ['avg_events_per_job', 'events_per_job', 'lumis_per_job']:
                if act in split:
                    before = split[act]
                    # floor division keeps the value integral; the clamp
                    # avoids requesting jobs of size zero
                    split[act] = max(1, before // factor)
                    print("Changing %s (%d) by a factor %d to %s"
                          % (act, before, factor, split[act]))
                    break
            split['requestName'] = acdc
            print("changing the splitting of %s" % acdc)
            print(json.dumps(split, indent=2))
            print(reqMgrClient.setWorkflowSplitting(url, acdc, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
def singleClone(url, wfname, actions, comment, do=False):
    """Submit a clone of a workflow, optionally changing memory and splitting.

    Args:
        url: passed through to the reqMgrClient / workflowInfo calls.
        wfname: name of the workflow to clone.
        actions: mapping of action name to requested value, or a false
            value for none. Keys starting with ``mem`` set the memory of
            the clone (for every ``Task<N>`` when the schema has a
            ``TaskChain`` key); keys starting with ``split`` change the job
            splitting of the submitted clone (``'<N>x'`` divides by N,
            ``'max'`` sets the size to 1, ``'Same'`` and ``''`` leave it
            alone, anything else divides by 2).
        comment: not used by this function.
        do: not consulted; the clone is always submitted.
            NOTE(review): unlike singleRecovery there is no dry run here --
            confirm with the callers before honouring the flag.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the
        clone, or None when the submission failed twice.
    """
    # NOTE: single-argument print(...) calls produce the same output under
    # the Python 2 print statement used by the rest of this file.
    wfi = workflowInfo(url, wfname)
    payload = wfi.getSchema()
    initial = wfi.request

    payload['Requestor'] = os.getenv('USER')
    payload['Group'] = 'DATAOPS'
    payload['OriginalRequestName'] = initial['RequestName']
    payload['RequestPriority'] = initial['RequestPriority']
    if 'ProcessingVersion' in initial:
        payload['ProcessingVersion'] = int(initial['ProcessingVersion']) + 1
    else:
        payload['ProcessingVersion'] = 2
    payload = reqMgrClient.purgeClonedSchema(payload)

    requested = actions or {}
    for action in requested:
        choice = requested[action]
        if action.startswith('mem') and choice != "" and choice != 'Same':
            if 'TaskChain' in payload:
                print("Setting memory for clone of task chain")
                mem_dict = {}
                it = 1
                while 'Task%d' % it in payload:
                    t = 'Task%d' % it
                    mem_dict[payload[t]['TaskName']] = int(choice)
                    # report the task just handled, before moving on
                    print("Memory set for %s" % t)
                    it += 1
                payload['Memory'] = mem_dict
            else:
                print("Setting memory for non-taskchain workflow")
                payload['Memory'] = int(choice)
                print("Memory set to %s" % choice)

    print("Clone payload")
    print(actions)

    ## create the clone, retrying once on failure
    clone = reqMgrClient.submitWorkflow(url, payload)
    if not clone:
        print("Error in making clone for %s" % initial["RequestName"])
        clone = reqMgrClient.submitWorkflow(url, payload)
        if not clone:
            print("Error twice in making clone for %s" % initial["RequestName"])
            sendLog('actor',
                    'Failed to make a clone twice for %s!' % initial["RequestName"],
                    level='critical')
            wfi.sendLog(
                'actor',
                'Failed to make a clone twice for %s!' % initial["RequestName"])
            return None

    ## change splitting if requested
    job_size_keys = ['avg_events_per_job', 'events_per_job', 'lumis_per_job']
    for action in requested:
        if not action.startswith('split'):
            continue
        cloneinfo = workflowInfo(url, clone)
        splittings = cloneinfo.getSplittingsNew(strip=True)
        choice = requested[action]
        if choice != 'Same' and choice != 'max' and choice != '':
            factor = int(choice[0:-1]) if 'x' in choice else 2
            for split in splittings:
                split_par = split['splitParams']
                for act in job_size_keys:
                    if act in split_par:
                        wfi.sendLog(
                            'actor', 'Changing %s (%d) by a factor %d' %
                            (act, split_par[act], factor))
                        # floor division keeps the value integral; the
                        # clamp avoids requesting jobs of size zero
                        split_par[act] = max(1, split_par[act] // factor)
                        print("to %s" % split_par[act])
                        break
        elif choice == 'max':
            for split in splittings:
                split_par = split['splitParams']
                for act in job_size_keys:
                    if act in split_par:
                        before = split_par[act]
                        wfi.sendLog(
                            'actor',
                            'Max splitting set for %s (%d)' % (act, before))
                        split_par[act] = 1
                        print("Changing %s (%d)  to max splitting  %s"
                              % (act, before, split_par[act]))
                        break
        print("changing the splitting of %s" % clone)
        print(json.dumps(splittings, indent=2))
        print(reqMgrClient.setWorkflowSplitting(url, clone, splittings))

    ## approve
    data = reqMgrClient.setWorkflowApproved(url, clone)
    print(data)
    return clone
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, an ACDC resubmission for one task.

    Args:
        url: passed through to the reqMgrClient / workflowInfo calls.
        task: task path stored as ``InitialTaskPath``; its last ``/``
            component is compared with the ``TaskName`` of each task.
        initial: request dictionary of the original workflow. It must hold
            ``ConfigCacheUrl``, ``RequestName`` and ``RequestString``; keys
            of ``copy_over`` that are missing are reported and skipped.
            It is not modified.
        actions: mapping of action name to requested value, or a false
            value for none. Keys starting with ``mem`` set the memory,
            ``multicore`` sets the core count and rescales the memory
            (starting from ``actions['memory']`` when given), keys starting
            with ``split`` change the job splitting of the submitted ACDC
            (``'<N>x'`` divides by N, ``'max'`` sets the size to 1,
            ``'Same'`` leaves it alone, anything else divides by 2).
        do: when false, only print the payload and return None.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the ACDC,
        or None for a dry run, a refused splitting change or two failed
        submissions.
    """
    # Function-scope import: the top-of-file imports are not visible from
    # this block, so do not rely on `re` being imported there.
    import re

    def scaled_memory(mem, cores_before, cores_after):
        # 40% of the memory is treated as a constant overhead, the rest is
        # scaled with the number of cores.
        fraction_constant = 0.4
        mem_per_core_c = int((1 - fraction_constant) * mem / float(cores_before))
        return int(mem + (cores_after - cores_before) * mem_per_core_c)

    # NOTE: single-argument print(...) calls produce the same output under
    # the Python 2 print statement used by the rest of this file.
    print("Inside single recovery!")
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['ConfigCacheUrl'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0,
    }
    copy_over = [
        'PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
        'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion',
    ]
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print("%s not in the initial payload" % c)

    # Boost the recovery over the initial workflow. Max priority is 1M,
    # never go above 500k. Skipped when the priority was not copied over.
    if 'RequestPriority' in payload:
        payload['RequestPriority'] = min(500000, payload['RequestPriority'] * 2)

    requested = actions or {}

    ## change parameters based on actions here
    for action in requested:
        value = requested[action]

        if action.startswith('mem') and value != "" and value != 'Same':
            if 'multicore' in requested and requested['multicore'] != "":
                # the multicore branch sets the memory, scaled to the new
                # number of cores
                continue
            set_to = int(value)
            if 'TaskChain' in initial:
                ## taskchains need the memory to be set for all tasks
                mem_dict = {}
                it = 1
                while 'Task%d' % it in initial:
                    t = 'Task%d' % it
                    it += 1
                    if t not in payload:
                        # copy, so that `initial` is never aliased
                        payload[t] = copy.deepcopy(initial[t])
                    mem_dict[payload[t]['TaskName']] = set_to
                payload['Memory'] = mem_dict
                print("Memory set to:  %s" % json.dumps(mem_dict, indent=2))
            else:
                payload['Memory'] = set_to
                print("Memory set to:  %s" % set_to)

        if action.startswith('multicore') and value != "":
            set_to = int(value)
            # an explicit memory request replaces the current memory as the
            # starting point of the scaling; it arrives as text, like the
            # other action values, hence the int()
            forced_mem = None
            if 'memory' in requested and requested['memory'] not in ("", 'Same'):
                forced_mem = int(requested['memory'])
            if 'TaskChain' in initial:
                ## taskchains need multicore and memory set for all tasks
                if isinstance(payload.get('Memory'), dict):
                    mem_dict = payload['Memory']
                else:
                    mem_dict = {}
                core_dict = {}
                it = 1
                while 'Task%d' % it in initial:
                    t = 'Task%d' % it
                    it += 1
                    if t not in payload:
                        payload[t] = copy.deepcopy(initial[t])
                    tname = payload[t]['TaskName']
                    initial_cores = payload[t].setdefault('Multicore', 1)
                    if forced_mem is not None:
                        mem = forced_mem
                    elif tname in mem_dict:
                        mem = mem_dict[tname]
                    else:
                        mem = payload[t]['Memory']
                    mem_dict[tname] = scaled_memory(mem, initial_cores, set_to)
                    core_dict[tname] = set_to
                    print("For  %s" % t)
                    print("Multicore set to  %s" % set_to)
                    print("Memory set to  %s" % mem_dict[tname])
                payload['Memory'] = mem_dict
                payload['Multicore'] = core_dict
            else:
                initial_cores = initial.get('Multicore', 1)
                mem = forced_mem if forced_mem is not None else payload['Memory']
                payload['Multicore'] = set_to
                payload['Memory'] = scaled_memory(mem, initial_cores, set_to)
                print("Multicore set to  %s" % set_to)
                print("Memory set to  %s" % payload['Memory'])

        if action.startswith('split'):
            split_alert = initial['RequestType'] in ['MonteCarlo']
            if initial.get('SplittingAlgo') in ['EventBased']:
                split_alert = True
            recover_task = task.split('/')[-1]
            for key in initial:
                if not key.startswith('Task') or key == 'TaskChain':
                    continue
                task_conf = initial[key]
                if not isinstance(task_conf, dict) or 'TaskName' not in task_conf:
                    continue
                this_taskname = task_conf['TaskName']
                print("For recovery of task %s" % recover_task)
                print("Looking at task %s" % this_taskname)
                if (recover_task == this_taskname and
                        task_conf.get('SplittingAlgo') in ['EventBased']):
                    ## the task to be recovered is of the wrong type to allow
                    ## a change of splitting: report it, but do not raise the
                    ## alert and do not stop the acdc
                    sendLog(
                        'actor',
                        'To recover on %s, changing the splitting on %s is not really allowed and this will be ignored instead of failing acdc.'
                        % (task, task_conf['SplittingAlgo']),
                        level='critical')
            if split_alert:
                sendLog('actor',
                        'Cannot change splitting for %s' % initial['RequestName'],
                        level='critical')
                print("I should not be doing splitting for this type of request %s"
                      % initial['RequestName'])
                return None

    ## name the request after the recovery round: ACDC<round>_<original string>
    acdc_round = 0
    initial_string = payload['RequestString']
    previous = re.match(r'ACDC(\d*)_', initial_string)
    if previous:
        # read the whole round number, so round 10 and later keep counting
        acdc_round = int(previous.group(1) or 0) + 1
        initial_string = initial_string[previous.end():]
    elif initial_string.startswith('ACDC'):
        acdc_round = 1
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(actions)

    ## submit here, retrying once on failure
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % initial["RequestName"])
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % initial["RequestName"])
            sendLog('actor',
                    'Failed twice in making ACDCs for %s!' % initial['RequestName'],
                    level='critical')
            return None

    ## change splitting if requested
    job_size_keys = ['avg_events_per_job', 'events_per_job', 'lumis_per_job']
    for action in requested:
        if not action.startswith('split'):
            continue
        acdcInfo = workflowInfo(url, acdc)
        splittings = acdcInfo.getSplittingsNew(strip=True)
        choice = requested[action]
        if choice != 'Same' and choice != 'max':
            factor = int(choice[0:-1]) if 'x' in choice else 2
            for split in splittings:
                split_par = split['splitParams']
                if split['splitAlgo'] in ['EventBased']:
                    sendLog(
                        'actor',
                        "Changing the splitting on %s for %s is not permitted. Not changing."
                        % (split['splitAlgo'], initial["RequestName"]),
                        level='critical')
                    continue
                for act in job_size_keys:
                    if act in split_par:
                        before = split_par[act]
                        # floor division keeps the value integral; the
                        # clamp avoids requesting jobs of size zero
                        split_par[act] = max(1, before // factor)
                        print("Changing %s (%d) by a factor %d to %s"
                              % (act, before, factor, split_par[act]))
                        break
        elif choice == 'max':
            for split in splittings:
                split_par = split['splitParams']
                for act in job_size_keys:
                    if act in split_par:
                        before = split_par[act]
                        split_par[act] = 1
                        print("Changing %s (%d)  to max splitting  %s"
                              % (act, before, split_par[act]))
                        break
        print("changing the splitting of %s" % acdc)
        print(json.dumps(splittings, indent=2))
        # TODO: check the value returned by setWorkflowSplitting
        reqMgrClient.setWorkflowSplitting(url, acdc, splittings)

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc