def changepswd():
    """Change the password of the authenticated user.

    The request must be JSON and carry 'username', 'oripswd' and 'newpswd'.
    The change is refused with a 401 unless the user exists, the original
    password matches and the user is the one identified by the JWT.  On
    success the current token is invalidated so the client has to log in
    again.

    Returns:
        The pipeline's own result when request validation fails, otherwise
        a ``(response, status_code)`` tuple.
    """
    current_user = get_jwt_identity()

    # Validate the request: it has to be JSON and carry all three fields.
    pipeline = Pipeline(request)
    pipeline.add(ensureJson)
    for field in ('username', 'oripswd', 'newpswd'):
        pipeline.add(ensureParam, [request, field])

    broken, retvs = pipeline.run()
    if broken:
        return retvs
    _, username, oripswd, newpswd = retvs

    sess = DBSession()
    user = sess.query(User).filter_by(username=username).first()

    # All three conditions must hold; evaluation stops at the first failure.
    credentials_ok = (user
                      and cmparePswd(oripswd, user.password)
                      and user.id == current_user)
    if not credentials_ok:
        return jsonify({"msg": "Bad username or password"}), 401

    user.setPassword(newpswd)
    sess.commit()

    # Revoke the token used for this request.
    invalidate(get_raw_jwt())
    response = jsonify(msg="Change password successfully, please relogin")
    return response, 200
def actor(url, options=None): mlock = moduleLock(wait=False, silent=True) if mlock(): return if userLock('actor'): return up = componentInfo(soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() WC = wtcClient() WI = wtcInfo() JC = JIRAClient() action_list = WC.get_actions() if action_list is None: print "Not able to load action list" sendLog('actor', 'Not able to load action list', level='critical') return if options.actions: action_list = json.loads(open(options.actions).read()) print json.dumps(action_list, indent=2) if not action_list: print "EMPTY!" return wf_list = action_list.keys() print json.dumps(sorted(wf_list), indent=2) if options.spec: wf_list = [wf for wf in wf_list if options.spec in wf] max_per_round = UC.get('max_per_round').get('actor', None) if max_per_round: random.shuffle(wf_list) wf_list = wf_list[:max_per_round] for wfname in wf_list: print '-' * 100 print "Looking at", wfname, "for recovery options" to_clone = False to_acdc = False to_force = False to_hold = False something_to_do = False tasks = action_list[wfname].get('Parameters', None) to_acdc = action_list[wfname].get('Action', None) == 'acdc' to_clone = action_list[wfname].get('Action', None) == 'clone' to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['by-pass', 'bypass'] to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['onhold', 'on-hold'] if not to_acdc and not to_clone and not to_force and not to_hold: sendLog( 'actor', 'Action submitted for something other than acdc, clone, bypass or hold for workflow %s' % wfname, level='critical') print json.dumps(action_list[wfname], indent=2) continue if not tasks and to_acdc: sendLog('actor', 'Empty action submitted for workflow %s' % wfname, level='critical') print "Moving on. 
Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor', 'Going to clone %s' % wfname) comment = "" if 'comment' in tasks: comment = ", reason: " + tasks['comment'] wfi.sendLog( 'actor', "invalidating the workflow by traffic controller %s" % comment) #Reject all workflows in the family inv_results = invalidate(url, wfi, only_resub=False, with_output=True) all_good = all(inv_results) if all_good: wfi.sendLog('actor', "%s and children are rejected" % wfname) else: wfi.sendLog('actor', "Failed to reject the request and dependents") sendLog('actor', 'Failed to reject the familly of %s' % wfname, level='critical') continue cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except Exception as e: sendLog( 'actor', 'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.' % wfname, level='critical') wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) print str(e) ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again if not cloned: recover = False wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) sendLog('actor', 'Failed to create clone for %s!' 
% wfname, level='critical') else: wfi.sendLog('actor', "Workflow %s cloned into %s" % (wfname, cloned)) ## set to trouble for swift replacement for wfo in session.query(Workflow).filter( Workflow.name == wfname).all(): wfo.status = 'trouble' session.commit() #=========================================================== elif to_force: wfi.sendLog( 'actor', 'Force-completing from workflow traffic controler request') WI.add(action='force', keyword=wfname, user=action_list[wfname].get('user', 'unified')) elif to_hold: wfi.sendLog('actor', 'Holding on workflow traffic controler request') WI.add(action='hold', keyword=wfname, user=action_list[wfname].get('user', 'unified')) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append( {setting: allTasksDefaults[setting]}) print "Tasks is " print json.dumps(tasks, indent=2) all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog( 'actor', 'Cannot create ACDCS for %s because WMErr cannot be reached.' % wfname, level='critical') continue if not WMErr: wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname) print "FYI getWMErrors is blank. Presumably there are only unreported errors" # continue try: where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo( ) print "Where to run = " print where_to_run if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') continue except: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') print "Moving on. 
Cannot access recovery info for " + wfname continue if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because site list cannot be found.' % wfname, level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 if WMErr: for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for", wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog( 'actor', 'Cannot create ACDCS for %s because it is a pLHE workflow.' % wfname, level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task print "Full task name is " + fulltaskname print where_to_run.keys() wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in [ 'Processing', 'Production', 'Merge' ]: wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks" % (fulltaskname, task_info.taskType)) if not fulltaskname in where_to_run.keys(): wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because there is no acdc doc for it anyways." 
% (fulltaskname)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites = [SI.SE_to_CE(actions[action])] else: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in actions[action] ])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in where_to_run[fulltaskname] ])) print "Found", sorted( assign_to_sites ), "as sites where to run the ACDC at, from the acdc doc of ", wfname print "Going to run at", sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do=options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog( 'actor', 'ACDC created for task %s. 
Actions taken \n%s' % (fulltaskname, json.dumps(actions))) jira_comment = "%s created ACDC for task %s with action %s" % ( action_list[wfname].get('user', 'unified'), task.split('/')[-1], json.dumps(actions), ) reason = action_list[wfname].get('Reason', None) if reason: jira_comment += '\ndue to: %s' % (reason) #team = wfi.request['Teams'][0] team = 'production' parameters = { 'SiteWhitelist': sorted(assign_to_sites), 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request[ 'RequestType'] == 'TaskChain' and 'Merge' in task.split( '/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists'] == 'true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'secondary' in actions: if actions['secondary'] == 'enabled': print 'Enabling reading the secondary input via xrootd' parameters['TrustPUSitelists'] = True elif actions['secondary'] == 'disabled': parameters['TrustPUSitelists'] = False #in case secondary is blank or not set to enabled or disabled elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC", acdc parameters['execute'] = True #wfi.sendLog('actor',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC", acdc sendLog('actor', "%s needs to be 
assigned" % (acdc), level='critical') wfi.sendLog( 'actor', "%s needs to be assigned by hand" % (acdc)) continue # print parameters result = reqMgrClient.assignWorkflow( url, acdc, team, parameters) if not result: print acdc, "was not assigned" sendLog('actor', "%s failed to be assigned" % (acdc), level='critical') wfi.sendLog( 'actor', "%s failed to get assigned for recovery" % acdc) else: wfi.sendLog('actor', "%s was assigned for recovery" % acdc) recovering.add(acdc) #wfi.sendLog('actor',"ACDCs created for %s"%wfname) try: if jira_comment: jiras = JC.find( {'prepid': wfi.request['PrepID']}) if len(jiras) == 1: ## put a comment on the single corresponding ticket JC.comment(jiras[0].key, jira_comment) JC.progress(jiras[0].key) except Exception as e: print "failed with JIRA" print str(e) #=========================================================== if recover and options.do: r = WC.remove_action(wfname) if not r: sendLog( 'actor', 'not able to remove the action, interlocking the module', level='critical') os.system('touch %s/actor.failed-%s.lock' % (base_eos_dir, os.getpid())) sys.exit(-1) ## update the status with recovering removing manual for wfo in session.query(Workflow).filter( Workflow.name == wfname).all(): wfo.status = wfo.status.replace('manual', 'recovering') session.commit() if message_to_user: print wfname, "to be notified to user(DUMMY)", message_to_user if message_to_ops: print 'message' #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return
def rejector(url, specific, options=None): up = componentInfo(soft=['wtc', 'jira']) #if not up.check(): return if specific and specific.startswith('/'): ## this is for a dataset print setDatasetStatus(specific, 'INVALID') return if options.filelist: wfs = [] for line in filter(None, open(options.filelist).read().split('\n')): print line wfs.extend( session.query(Workflow).filter( Workflow.name.contains(line)).all()) elif specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() if not wfs: batches = batchInfo().content() for bname in batches: if specific == bname: for pid in batches[bname]: b_wfs = getWorkflowById(url, pid) for wf in b_wfs: wfs.append( session.query(Workflow).filter( Workflow.name == wf).first()) break else: wfs = session.query(Workflow).filter( Workflow.status == 'assistance-clone').all() #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all()) ## be careful then on clone case by case options.clone = True print "not supposed to function yet" return print len(wfs), "to reject" if len(wfs) > 1: print "\n".join([wfo.name for wfo in wfs]) answer = raw_input('Reject these') if not answer.lower() in ['y', 'yes']: return for wfo in wfs: #wfo = session.query(Workflow).filter(Workflow.name == specific).first() if not wfo: print "cannot reject", spec return wfi = workflowInfo(url, wfo.name) comment = "" if options.comments: comment = ", reason: " + options.comments if options.keep: wfi.sendLog( 'rejector', 'invalidating the workflow by unified operator%s' % comment) else: wfi.sendLog( 'rejector', 'invalidating the workflow and outputs by unified operator%s' % comment) results = invalidate(url, wfi, only_resub=True, with_output=(not options.keep)) if all(results): print wfo.name, "rejected" if options and options.clone: wfo.status = 'trouble' session.commit() schema = wfi.getSchema() schema['Requestor'] = os.getenv('USER') schema['Group'] = 'DATAOPS' schema['OriginalRequestName'] = wfo.name if 
'ProcessingVersion' in schema: schema['ProcessingVersion'] = int( schema['ProcessingVersion'] ) + 1 ## dubious str->int conversion else: schema['ProcessingVersion'] = 2 for k in schema.keys(): if k.startswith('Team'): schema.pop(k) if k.startswith('checkbox'): schema.pop(k) ## a few tampering of the original request if options.Memory: if schema['RequestType'] == 'TaskChain': it = 1 while True: t = 'Task%d' % it it += 1 if t in schema: schema[t]['Memory'] = options.Memory else: break else: schema['Memory'] = options.Memory if options.short_task and schema['RequestType'] == 'TaskChain': translate = {} it = 1 while True: tt = 'Task%d' % it if tt in schema: tname = schema[tt]['TaskName'] ntname = 'T%d' % it translate[tname] = ntname it += 1 schema[tt]['TaskName'] = ntname if 'InputTask' in schema[tt]: itname = schema[tt]['InputTask'] schema[tt]['InputTask'] = translate[itname] else: break for k in schema.get('ProcessingString', {}).keys(): schema['ProcessingString'][ translate[k]] = schema['ProcessingString'].pop(k) for k in schema.get('AcquisitionEra', {}).keys(): schema['AcquisitionEra'][ translate[k]] = schema['AcquisitionEra'].pop(k) if options.Multicore: ## to do : set it properly in taskchains if schema['RequestType'] == 'TaskChain': tasks, set_to = options.Multicore.split( ':') if ':' in options.Multicore else ( "", options.Multicore) set_to = int(set_to) tasks = tasks.split(',') if tasks else ['Task1'] it = 1 while True: tt = 'Task%d' % it it += 1 if tt in schema: tname = schema[tt]['TaskName'] if tname in tasks or tt in tasks: mem = schema[tt]['Memory'] mcore = schema[tt].get('Multicore', 1) factor = (set_to / float(mcore)) fraction_constant = 0.4 mem_per_core_c = int( (1 - fraction_constant) * mem / float(mcore)) print "mem per core", mem_per_core_c print "base mem", mem ## adjusting the parameter in the clone schema[tt]['Memory'] += ( set_to - mcore) * mem_per_core_c schema[tt]['Multicore'] = set_to schema[tt]['TimePerEvent'] /= factor else: break else: 
schema['Multicore'] = options.Multicore if options.deterministic: if schema['RequestType'] == 'TaskChain': schema['Task1']['DeterministicPileup'] = True if options.EventsPerJob: if schema['RequestType'] == 'TaskChain': schema['Task1']['EventsPerJob'] = options.EventsPerJob else: schema['EventsPerJob'] = options.EventsPerJob if options.EventAwareLumiBased: schema['SplittingAlgo'] = 'EventAwareLumiBased' if options.TimePerEvent: schema['TimePerEvent'] = options.TimePerEvent if options.ProcessingString: schema['ProcessingString'] = options.ProcessingString if options.AcquisitionEra: schema['AcquisitionEra'] = options.AcquisitionEra if options.runs: schema['RunWhitelist'] = map(int, options.runs.split(',')) if options.PrepID: schema['PrepID'] = options.PrepID if schema['RequestType'] == 'TaskChain' and options.no_output: ntask = schema['TaskChain'] for it in range(1, ntask - 1): schema['Task%d' % it]['KeepOutput'] = False schema['TaskChain'] = ntask - 1 schema.pop('Task%d' % ntask) if options.priority: schema['RequestPriority'] = options.priority ## update to the current priority schema['RequestPriority'] = wfi.request['RequestPriority'] ## drop shit on the way to reqmgr2 schema = reqMgrClient.purgeClonedSchema(schema) print "submitting" if (options.to_stepchain and (schema['RequestType'] == 'TaskChain')): ## transform the schema into StepChain schema print "Transforming a TaskChain into a StepChain" mcore = 0 mem = 0 schema['RequestType'] = 'StepChain' schema['StepChain'] = schema.pop('TaskChain') schema['SizePerEvent'] = 0 schema['TimePerEvent'] = 0 step = 1 s_n = {} while True: if 'Task%d' % step in schema: sname = 'Step%d' % step schema[sname] = schema.pop('Task%d' % step) tmcore = schema[sname].pop('Multicore') tmem = schema[sname].pop('Memory') if mcore and tmcore != mcore: wfi.sendLog( 'rejector', 'the conversion to stepchain encoutered different value of Multicore %d != %d' % (tmcore, mcore)) sendLog( 'rejector', 'the conversion of %s to stepchain encoutered 
different value of Multicore %d != %d' % (wfo.name, tmcore, mcore), level='critical') mcore = max(mcore, tmcore) mem = max(mem, tmem) schema[sname]['StepName'] = schema[sname].pop( 'TaskName') s_n[schema[sname]['StepName']] = sname if 'InputTask' in schema[sname]: schema[sname]['InputStep'] = schema[sname].pop( 'InputTask') eff = 1. up_s = sname while True: ## climb up a step. supposedely already all converted up_s = s_n.get( schema[up_s].get('InputStep', None), None) if up_s: ## multiply with the efficiency eff *= schema[up_s].get( 'FilterEfficiency', 1.) else: ## or stop there break if not 'KeepOutput' in schema[sname]: ## this is a weird translation capability. Absence of keepoutput in step means : keep the output. while in TaskChain absence means : drop schema[sname]['KeepOutput'] = False schema['TimePerEvent'] += eff * schema[sname].pop( 'TimePerEvent') schema['SizePerEvent'] += eff * schema[sname].pop( 'SizePerEvent') step += 1 else: break schema['Multicore'] = mcore schema['Memory'] = mem print json.dumps(schema, indent=2) newWorkflow = reqMgrClient.submitWorkflow(url, schema) if not newWorkflow: msg = "Error in cloning {}".format(wfo.name) print(msg) wfi.sendLog('rejector', msg) # Get the error message time.sleep(5) data = reqMgrClient.requestManagerPost( url, "/reqmgr2/data/request", schema) wfi.sendLog('rejector', data) print json.dumps(schema, indent=2) return print newWorkflow data = reqMgrClient.setWorkflowApproved(url, newWorkflow) print data wfi.sendLog( 'rejector', 'Cloned into %s by unified operator %s' % (newWorkflow, comment)) #wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False) else: wfo.status = 'trouble' if options.set_trouble else 'forget' wfi.notifyRequestor('Rejected by unified operator %s' % (comment), do_batch=False) session.commit() else: msg = "Error in rejecting {}: {}".format(wfo.name, results) print(msg) wfi.sendLog('rejector', msg)
def invalidator(url, invalid_status='INVALID'): use_mcm = True up = componentInfo(soft=['wtc','jira']) if not up.check(): return mcm = McMClient(dev=False) invalids = mcm.getA('invalidations',query='status=announced') if not invalids: return print len(invalids),"Object to be invalidated" text_to_batch = defaultdict(str) text_to_request = defaultdict(str) for invalid in invalids: acknowledge= False pid = invalid['prepid'] batch_lookup = invalid['prepid'] text = "" if invalid['type'] == 'request': wfn = invalid['object'] print "need to invalidate the workflow",wfn wfo = session.query(Workflow).filter(Workflow.name == wfn).first() if wfo: ## set forget of that thing (although checkor will recover from it) print "setting the status of",wfo.status,"to forget" wfo.status = 'forget' session.commit() else: ## do not go on like this, do not acknoledge it print wfn,"is set to be rejected, but we do not know about it yet" #continue wfi = workflowInfo(url, wfn) success = "not rejected" ## to do, we should find a way to reject the workflow and any related acdc successes = invalidate(url, wfi, only_resub=True, with_output=False) wfi.sendLog('invalidator',"rejection is performed from McM invalidations request") acknowledge= all(successes) text = "The workflow %s (%s) was rejected due to invalidation in McM" % ( wfn, pid ) batch_lookup = wfn ##so that the batch id is taken as the one containing the workflow name elif invalid['type'] == 'dataset': dataset = invalid['object'] if '?' in dataset: continue if 'None' in dataset: continue if 'None-' in dataset: continue if 'FAKE-' in dataset: continue print "setting",dataset,"to",invalid_status success = setDatasetStatus(dataset , invalid_status ) if success: acknowledge= True text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % ( dataset, pid ) else: msg = "Could not invalidate {}. 
Please consider contacting data management team for manual intervention.".format(dataset) print(msg) sendLog('invalidator', msg, level='critical') else: print "\t\t",invalid['type']," type not recognized" if acknowledge: ## acknoldge invalidation in mcm, provided we can have the api print "acknowledgment to mcm" ackno_url = '/restapi/invalidations/acknowledge/%s'%( invalid['_id'] ) print "at",ackno_url mcm.get(ackno_url) # prepare the text for batches batches = [] batches.extend(mcm.getA('batches',query='contains=%s'%batch_lookup)) batches = filter(lambda b : b['status'] in ['announced','done','reset'], batches) if len(batches): bid = batches[-1]['prepid'] print "batch nofication to",bid text_to_batch[bid] += text+"\n\n" # prepare the text for requests text_to_request[pid] += text+"\n\n" for bid,text in text_to_batch.items(): if not text: continue text += '\n This is an automated message' mcm.put('/restapi/batches/notify',{ "notes" : text, "prepid" : bid}) pass for pid,text in text_to_request.items(): if not text: continue text += '\n This is an automated message' mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid]})
def invalidator(url, invalid_status='INVALID'): use_mcm = True up = componentInfo(soft=['wtc']) if not up.check(): return mcm = McMClient(dev=False) invalids = mcm.getA('invalidations', query='status=announced') if not invalids: return print len(invalids), "Object to be invalidated" text_to_batch = defaultdict(str) text_to_request = defaultdict(str) for invalid in invalids: acknowledge = False pid = invalid['prepid'] batch_lookup = invalid['prepid'] text = "" if invalid['type'] == 'request': wfn = invalid['object'] print "need to invalidate the workflow", wfn wfo = session.query(Workflow).filter(Workflow.name == wfn).first() if wfo: ## set forget of that thing (although checkor will recover from it) print "setting the status of", wfo.status, "to forget" wfo.status = 'forget' session.commit() else: ## do not go on like this, do not acknoledge it print wfn, "is set to be rejected, but we do not know about it yet" #continue wfi = workflowInfo(url, wfn) success = "not rejected" ## to do, we should find a way to reject the workflow and any related acdc successes = invalidate(url, wfi, only_resub=True, with_output=False) wfi.sendLog( 'invalidator', "rejection is performed from McM invalidations request") acknowledge = all(successes) text = "The workflow %s (%s) was rejected due to invalidation in McM" % ( wfn, pid) batch_lookup = wfn ##so that the batch id is taken as the one containing the workflow name elif invalid['type'] == 'dataset': dataset = invalid['object'] if '?' 
in dataset: continue if 'None' in dataset: continue if 'None-' in dataset: continue if 'FAKE-' in dataset: continue print "setting", dataset, "to", invalid_status success = setDatasetStatus(dataset, invalid_status) if success: acknowledge = True text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % ( dataset, pid) else: print "invalidation of", dataset, "did not go so well" else: print "\t\t", invalid['type'], " type not recognized" if acknowledge: ## acknoldge invalidation in mcm, provided we can have the api print "acknowledgment to mcm" ackno_url = '/restapi/invalidations/acknowledge/%s' % ( invalid['_id']) print "at", ackno_url mcm.get(ackno_url) # prepare the text for batches batches = [] batches.extend( mcm.getA('batches', query='contains=%s' % batch_lookup)) batches = filter( lambda b: b['status'] in ['announced', 'done', 'reset'], batches) if len(batches): bid = batches[-1]['prepid'] print "batch nofication to", bid text_to_batch[bid] += text + "\n\n" # prepare the text for requests text_to_request[pid] += text + "\n\n" for bid, text in text_to_batch.items(): if not text: continue text += '\n This is an automated message' mcm.put('/restapi/batches/notify', {"notes": text, "prepid": bid}) pass for pid, text in text_to_request.items(): if not text: continue text += '\n This is an automated message' mcm.put('/restapi/requests/notify', { "message": text, "prepids": [pid] })
def logout():
    """Invalidate the JWT of the current request and confirm the logout.

    Returns:
        A ``(response, status_code)`` tuple with status 200.
    """
    token = get_raw_jwt()
    invalidate(token)
    response = jsonify(result=True, msg="Successfully logged out")
    return response, 200
def rejector(url, specific, options=None): up = componentInfo(soft=['wtc','jira']) if not up.check(): return if specific and specific.startswith('/'): ## this is for a dataset print setDatasetStatus(specific, 'INVALID') return if options.filelist: wfs = [] for line in filter(None, open(options.filelist).read().split('\n')): print line wfs.extend( session.query(Workflow).filter(Workflow.name.contains(line)).all()) elif specific: wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all() if not wfs: batches = batchInfo().content() for bname in batches: if specific == bname: for pid in batches[bname]: b_wfs = getWorkflowById(url, pid) for wf in b_wfs: wfs.append( session.query(Workflow).filter(Workflow.name == wf).first()) break else: wfs = session.query(Workflow).filter(Workflow.status == 'assistance-clone').all() #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all()) ## be careful then on clone case by case options.clone = True print "not supposed to function yet" return print len(wfs),"to reject" if len(wfs)>1: print "\n".join( [wfo.name for wfo in wfs] ) answer = raw_input('Reject these') if not answer.lower() in ['y','yes']: return for wfo in wfs: #wfo = session.query(Workflow).filter(Workflow.name == specific).first() if not wfo: print "cannot reject",spec return wfi = workflowInfo(url, wfo.name) comment="" if options.comments: comment = ", reason: "+options.comments if options.keep: wfi.sendLog('rejector','invalidating the workflow by unified operator%s'%comment) else: wfi.sendLog('rejector','invalidating the workflow and outputs by unified operator%s'%comment) results = invalidate(url, wfi, only_resub=True, with_output= (not options.keep)) if all(results): print wfo.name,"rejected" if options and options.clone: wfo.status = 'trouble' session.commit() schema = wfi.getSchema() schema['Requestor'] = os.getenv('USER') schema['Group'] = 'DATAOPS' schema['OriginalRequestName'] = wfo.name if 'ProcessingVersion' in 
schema: schema['ProcessingVersion'] = int(schema['ProcessingVersion'])+1 ## dubious str->int conversion else: schema['ProcessingVersion']=2 for k in schema.keys(): if k.startswith('Team'): schema.pop(k) if k.startswith('checkbox'): schema.pop(k) ## a few tampering of the original request if options.Memory: if schema['RequestType'] == 'TaskChain': it=1 while True: t = 'Task%d'%it it+=1 if t in schema: schema[t]['Memory'] = options.Memory else: break else: schema['Memory'] = options.Memory if options.Multicore: ## to do : set it properly in taskchains if schema['RequestType'] == 'TaskChain': tasks,set_to = options.Multicore.split(':') if ':' in options.Multicore else ("",options.Multicore) set_to = int(set_to) tasks = tasks.split(',') if tasks else ['Task1'] it = 1 while True: tt = 'Task%d'% it it+=1 if tt in schema: tname = schema[tt]['TaskName'] if tname in tasks or tt in tasks: mem = schema[tt]['Memory'] mcore = schema[tt].get('Multicore',1) factor = (set_to / float(mcore)) fraction_constant = 0.4 mem_per_core_c = int((1-fraction_constant) * mem / float(mcore)) print "mem per core", mem_per_core_c print "base mem", mem ## adjusting the parameter in the clone schema[tt]['Memory'] += (set_to-mcore)*mem_per_core_c schema[tt]['Multicore'] = set_to schema[tt]['TimePerEvent'] /= factor else: break else: schema['Multicore'] = options.Multicore if options.deterministic: if schema['RequestType'] == 'TaskChain': schema['Task1']['DeterministicPileup'] = True if options.EventsPerJob: if schema['RequestType'] == 'TaskChain': schema['Task1']['EventsPerJob'] = options.EventsPerJob else: schema['EventsPerJob'] = options.EventsPerJob if options.EventAwareLumiBased: schema['SplittingAlgo'] = 'EventAwareLumiBased' if options.TimePerEvent: schema['TimePerEvent'] = options.TimePerEvent if options.ProcessingString: schema['ProcessingString'] = options.ProcessingString if options.AcquisitionEra: schema['AcquisitionEra'] = options.AcquisitionEra if options.runs: schema['RunWhitelist'] = 
map(int,options.runs.split(',')) if options.PrepID: schema['PrepID'] =options.PrepID if schema['RequestType'] == 'TaskChain' and options.no_output: ntask = schema['TaskChain'] for it in range(1,ntask-1): schema['Task%d'%it]['KeepOutput'] = False schema['TaskChain'] = ntask-1 schema.pop('Task%d'%ntask) if options.priority: schema['RequestPriority'] = options.priority ## update to the current priority schema['RequestPriority'] = wfi.request['RequestPriority'] ## drop shit on the way to reqmgr2 schema = reqMgrClient.purgeClonedSchema( schema ) print "submitting" if (options.to_stepchain and (schema['RequestType']=='TaskChain')): ## transform the schema into StepChain schema print "Transforming a TaskChain into a StepChain" mcore = 0 mem = 0 schema['RequestType'] = 'StepChain' schema['StepChain'] = schema.pop('TaskChain') schema['SizePerEvent'] = 0 schema['TimePerEvent'] = 0 step=1 s_n = {} while True: if 'Task%d'%step in schema: sname = 'Step%d'%step schema[sname] = schema.pop('Task%d'%step) tmcore = schema[sname].pop('Multicore') tmem = schema[sname].pop('Memory') if mcore and tmcore != mcore: wfi.sendLog('rejector','the conversion to stepchain encoutered different value of Multicore %d != %d'%( tmcore, mcore)) sendLog('rejector','the conversion of %s to stepchain encoutered different value of Multicore %d != %d'%( wfo.name, tmcore, mcore), level='critical') mcore = max(mcore, tmcore) mem = max(mem, tmem) schema[sname]['StepName'] = schema[sname].pop('TaskName') s_n[ schema[sname]['StepName'] ] = sname if 'InputTask' in schema[sname]: schema[sname]['InputStep'] = schema[sname].pop('InputTask') eff = 1. up_s = sname while True: ## climb up a step. supposedely already all converted up_s = s_n.get(schema[up_s].get('InputStep',None),None) if up_s: ## multiply with the efficiency eff *= schema[up_s].get('FilterEfficiency',1.) else: ## or stop there break if not 'KeepOutput' in schema[sname]: ## this is a weird translation capability. 
Absence of keepoutput in step means : keep the output. while in TaskChain absence means : drop schema[sname]['KeepOutput'] = False schema['TimePerEvent'] += eff*schema[sname].pop('TimePerEvent') schema['SizePerEvent'] += eff*schema[sname].pop('SizePerEvent') step+=1 else: break schema['Multicore'] = mcore schema['Memory'] = mem print json.dumps( schema, indent=2 ) newWorkflow = reqMgrClient.submitWorkflow(url, schema) if not newWorkflow: msg = "Error in cloning {}".format(wfo.name) print(msg) wfi.sendLog('rejector',msg) # Get the error message time.sleep(5) data = reqMgrClient.requestManagerPost(url, "/reqmgr2/data/request", schema) wfi.sendLog('rejector',data) print json.dumps( schema, indent=2 ) return print newWorkflow data = reqMgrClient.setWorkflowApproved(url, newWorkflow) print data wfi.sendLog('rejector','Cloned into %s by unified operator %s'%( newWorkflow, comment )) wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False) else: wfo.status = 'trouble' if options.set_trouble else 'forget' wfi.notifyRequestor('Rejected by unified operator %s'%( comment ),do_batch=False) session.commit() else: msg = "Error in rejecting {}: {}".format(wfo.name,results) print(msg) wfi.sendLog('rejector',msg)
def actor(url,options=None): mlock = moduleLock(wait=False ,silent=True) if mlock(): return if userLock('actor'): return up = componentInfo(soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() WC = wtcClient() WI = wtcInfo() JC = JIRAClient() action_list = WC.get_actions() if action_list is None: print "Not able to load action list" sendLog('actor','Not able to load action list', level='critical') return if options.actions: action_list = json.loads(open(options.actions).read()) print json.dumps( action_list, indent=2) if not action_list: print "EMPTY!" return wf_list = action_list.keys() print json.dumps( sorted( wf_list), indent=2) if options.spec: wf_list = [wf for wf in wf_list if options.spec in wf] max_per_round = UC.get('max_per_round').get('actor', None) if max_per_round: random.shuffle( wf_list ) wf_list = wf_list[:max_per_round] for wfname in wf_list: print '-'*100 print "Looking at",wfname,"for recovery options" to_clone = False to_acdc = False to_force = False to_hold = False something_to_do = False tasks = action_list[wfname].get( 'Parameters' , None) to_acdc = action_list[wfname].get( 'Action', None) == 'acdc' to_clone = action_list[wfname].get( 'Action', None) == 'clone' to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters' ,{}).get('action',None) in ['by-pass', 'bypass'] to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters' ,{}).get('action',None) in ['onhold','on-hold'] if not to_acdc and not to_clone and not to_force and not to_hold: sendLog('actor','Action submitted for something other than acdc, clone, bypass or hold for workflow %s'%wfname,level='critical') print json.dumps( action_list[wfname] , indent=2) continue if not tasks and to_acdc: sendLog('actor','Empty action submitted for workflow %s'%wfname,level='critical') print "Moving on. 
Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor','Going to clone %s'%wfname) comment="" if 'comment' in tasks: comment = ", reason: "+ tasks['comment'] wfi.sendLog('actor',"invalidating the workflow by traffic controller %s"%comment) #Reject all workflows in the family inv_results = invalidate(url, wfi, only_resub=False, with_output=True) all_good = all(inv_results) if all_good: wfi.sendLog('actor',"%s and children are rejected"%wfname) else: wfi.sendLog('actor',"Failed to reject the request and dependents") sendLog('actor','Failed to reject the familly of %s'% wfname, level='critical') continue cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except Exception as e: sendLog('actor','Failed to create clone for %s! Check logs for more information. 
Action will need to be resubmitted.'%wfname,level='critical') wfi.sendLog('actor','Failed to create clone for %s!'%wfname) print str(e) ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again if not cloned: recover = False wfi.sendLog('actor','Failed to create clone for %s!'%wfname) sendLog('actor','Failed to create clone for %s!'%wfname,level='critical') else: wfi.sendLog('actor',"Workflow %s cloned into %s"%(wfname, cloned)) ## set to trouble for swift replacement for wfo in session.query(Workflow).filter(Workflow.name == wfname).all(): wfo.status = 'trouble' session.commit() #=========================================================== elif to_force: wfi.sendLog('actor','Force-completing from workflow traffic controler request') WI.add(action='force', keyword = wfname, user = action_list[wfname].get( 'user', 'unified')) elif to_hold: wfi.sendLog('actor','Holding on workflow traffic controler request') WI.add(action='hold', keyword = wfname, user = action_list[wfname].get( 'user', 'unified')) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append({setting:allTasksDefaults[setting]}) print "Tasks is " print json.dumps(tasks, indent=2) all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog('actor','Cannot create ACDCS for %s because WMErr cannot be reached.'%wfname,level='critical') continue if not WMErr: wfi.sendLog('actor','WMErrors is blank for %s.'%wfname) print "FYI getWMErrors is blank. 
Presumably there are only unreported errors" # continue try: where_to_run, missing_to_run,missing_to_run_at = wfi.getRecoveryInfo() print "Where to run = " print where_to_run if not where_to_run: sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical') continue except: sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical') print "Moving on. Cannot access recovery info for " + wfname continue if not where_to_run: sendLog('actor','Cannot create ACDCS for %s because site list cannot be found.'%wfname,level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 if WMErr: for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for",wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog('actor','Cannot create ACDCS for %s because it is a pLHE workflow.'%wfname,level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. 
ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task print "Full task name is " + fulltaskname print where_to_run.keys() wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in ['Processing','Production','Merge']: wrong_task= True wfi.sendLog('actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"%( fulltaskname, task_info.taskType)) if not fulltaskname in where_to_run.keys(): wrong_task= True wfi.sendLog('actor', "Skipping task %s because there is no acdc doc for it anyways."%(fulltaskname)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites=[SI.SE_to_CE(actions[action])] else: assign_to_sites=list(set([SI.SE_to_CE(site) for site in actions[action]])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[fulltaskname]])) print "Found",sorted(assign_to_sites),"as sites where to run the ACDC at, from the acdc doc of ",wfname print "Going to run at",sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do = options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." 
sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog('actor','ACDC created for task %s. Actions taken \n%s'%(fulltaskname,json.dumps(actions))) jira_comment = "%s created ACDC for task %s with action %s"%( action_list[wfname].get( 'user', 'unified'), task.split('/')[-1] , json.dumps(actions), ) reason = action_list[wfname].get( 'Reason', None) if reason: jira_comment += '\ndue to: %s'%(reason) #team = wfi.request['Teams'][0] team = 'production' parameters={ 'SiteWhitelist' : sorted(assign_to_sites), 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists']=='true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'secondary' in actions: if actions['secondary'] == 'enabled': print 'Enabling reading the secondary input via xrootd' parameters['TrustPUSitelists'] = True elif actions['secondary'] == 'disabled': 
parameters['TrustPUSitelists'] = False #in case secondary is blank or not set to enabled or disabled elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']: parameters['TrustPUSitelists'] = True elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC",acdc parameters['execute']=True #wfi.sendLog('actor',"%s was assigned for recovery"% acdc) else: print "no assignment done with this ACDC",acdc sendLog('actor',"%s needs to be assigned"%(acdc), level='critical') wfi.sendLog('actor',"%s needs to be assigned by hand"%(acdc)) continue # print parameters result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) if not result: print acdc,"was not assigned" sendLog('actor',"%s failed to be assigned"%(acdc), level='critical') wfi.sendLog('actor',"%s failed to get assigned for recovery"% acdc) else: wfi.sendLog('actor',"%s was assigned for recovery"% acdc) recovering.add( acdc ) #wfi.sendLog('actor',"ACDCs created for %s"%wfname) try: if jira_comment: jiras = JC.find({'prepid' : wfi.request['PrepID']}) if len(jiras)==1: ## put a comment on the single corresponding ticket JC.comment(jiras[0].key, jira_comment) JC.progress(jiras[0].key) except Exception as e: print "failed with JIRA" print str(e) #=========================================================== if recover and options.do: r = WC.remove_action(wfname) if not r: sendLog('actor','not able to remove the action, interlocking the module', level='critical') os.system('touch %s/actor.failed-%s.lock'%( base_eos_dir, os.getpid() )) sys.exit(-1) ## update the status with recovering removing manual for wfo in session.query(Workflow).filter(Workflow.name == wfname).all(): wfo.status = wfo.status.replace('manual','recovering') session.commit() if message_to_user: print wfname,"to be notified to user(DUMMY)",message_to_user if message_to_ops: print 'message' #sendEmail( "notification 
in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return