def batchor(url):
    """
    Set up the batch configuration of newly approved RelVal and HIRelVal
    workflows, announce the new batches and drop the finished ones.

    Steps, in order:
      1. collect the assignment-approved TaskChain workflows of the users
         listed under "user_relval" and group their PrepIDs by campaign,
         separately for SubRequestType RelVal and HIRelVal;
      2. for every campaign not yet known to batchInfo, build its setup,
         record it in batchInfo and create the campaign configuration;
      3. send an e-mail for every campaign that is new to campaignInfo;
      4. remove the relval campaigns whose workflows are all out of the
         active statuses;
      5. merge the newly built setups into campaignInfo.

    :param url: passed through to getWorkflows, getWorkflowsByName and
                getWorkflowByCampaign
    :return: None
    """
    UC = unifiedConfiguration()
    SI = global_SI()
    CI = campaignInfo()
    BI = batchInfo()

    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    relval_users = UC.get("user_relval")
    if relval_users:
        users = ','.join(relval_users)
        wfs = getWorkflows(url, 'assignment-approved', details=False, user=users, rtype='TaskChain')
        if wfs:
            # then there is likely work to be done
            all_wfs = getWorkflowsByName(url, wfs, details=True)

    def of_subtype(subtype):
        ## requests without a SubRequestType are never selected
        return [r for r in all_wfs if 'SubRequestType' in r and r['SubRequestType'] == subtype]

    wfs = of_subtype('RelVal')
    ## need a special treatment for those
    hi_wfs = of_subtype('HIRelVal')

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print("Relval: %s %s" % (wf['RequestName'], wf['Campaign']))
        by_campaign[wf['Campaign']].add(wf['PrepID'])
    for wf in hi_wfs:
        print("HI Relval: %s %s" % (wf['RequestName'], wf['Campaign']))
        by_hi_campaign[wf['Campaign']].add(wf['PrepID'])

    default_setup = {
        "go": True,
        "parameters": {
            "SiteWhitelist": ["T1_US_FNAL"],
            "MergedLFNBase": "/store/relval",
            "Team": "relval",
            "NonCustodialGroup": "RelVal"
        },
        "custodial_override": "notape",
        "phedex_group": "RelVal",
        "lumisize": -1,
        "fractionpass": 0.0,
        "maxcopies": 1
    }
    default_hi_setup = copy.deepcopy(default_setup)

    add_on = {}
    ## a missing routing section simply means that nothing is augmented
    relval_routing = UC.get('relval_routing') or {}

    def pick_one_site(p):
        ## modify the parameters on the spot to have only one site
        ## returns False when several sites are whitelisted and none of them is ready
        if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(p["parameters"]["SiteWhitelist"]) > 1:
            choose_from = list(set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready))
            if not choose_from:
                ## random.choice would raise IndexError on an empty list
                return False
            picked = random.choice(choose_from)
            print("picked %s from %s" % (picked, choose_from))
            p["parameters"]["SiteWhitelist"] = [picked]
        return True

    def register_batch(campaign, setup, prep_ids, flavour):
        ## record the new batch locally and create its campaign configuration
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the %s campaigns %s with parameters \n%s' % (
                    flavour, campaign, json.dumps(setup, indent=2)),
                level='critical')
        BI.update(campaign, prep_ids)
        # now update it in central CouchDB
        setup['name'] = campaign
        wmcoreCamp = parseMongoCampaigns(setup)[0]
        res = createCampaignConfig(wmcoreCamp)
        print("Campaign %s correctly created in ReqMgr2: %s" % (wmcoreCamp['CampaignName'], res))

    batches = BI.all()
    for campaign in by_campaign:
        if campaign in batches:
            continue
        ## get a bunch of information
        setup = copy.deepcopy(default_setup)
        for key in relval_routing:
            if key in campaign:
                ## augment with the routing information
                augment_with = relval_routing[key]
                print("Modifying the batch configuration because of keyword %s" % (key,))
                print("with %s" % (augment_with,))
                setup = deep_update(setup, augment_with)
        if not pick_one_site(setup):
            ## the batch is not recorded, so it is considered again on the next run
            sendLog('batchor',
                    'None of the sites %s is ready for the relval campaign %s, skipping it for now' % (
                        setup["parameters"]["SiteWhitelist"], campaign),
                    level='critical')
            continue
        register_batch(campaign, setup, by_campaign[campaign], 'relval')

    for campaign in by_hi_campaign:
        if campaign in batches:
            continue
        ## get a bunch of information
        setup = copy.deepcopy(default_hi_setup)
        ##possible_sites = set(["T1_DE_KIT","T1_FR_CCIN2P3"])
        ##hi_site = random.choice(list(possible_sites))
        hi_site = "T2_CH_CERN"
        setup["parameters"]["SiteWhitelist"] = [hi_site]
        ## single-site whitelist: this leaves the setup untouched
        pick_one_site(setup)
        register_batch(campaign, setup, by_hi_campaign[campaign], 'HI relval')

    ## only new campaigns in announcement
    for new_campaign in list(set(add_on.keys()) - set(CI.all(c_type='relval'))):
        ## this is new, and can be announced as such
        print("%s is new stuff" % (new_campaign,))
        subject = "Request of RelVal samples batch %s" % new_campaign
        ## NOTE(review): the text is reproduced character for character as found
        ## in the source; confirm the intended line breaks of the message
        text = (
            "Dear all, A new batch of relval workflows was requested. \n"
            "Batch ID: %s Details of the workflows: "
            "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s "
            "This is an automated message"
        ) % (new_campaign, new_campaign)
        print(subject)
        print(text)
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        # sendLog('batchor',text, level='critical')

    ## go through all existing campaigns and remove the ones not in use anymore ?
    still_active = ('completed', 'force-complete', 'running-open', 'running-closed',
                    'acquired', 'staged', 'staging', 'assigned', 'assignment-approved')
    for old_campaign in CI.all(c_type='relval'):
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        if not all_in_batch:
            continue
        ## check all statuses
        is_batch_done = all(wf['RequestStatus'] not in still_active for wf in all_in_batch)
        if is_batch_done:
            CI.pop(old_campaign)
            BI.pop(old_campaign)
            print("batch %s  configuration was removed" % (old_campaign,))
            res = deleteCampaignConfig(old_campaign)
            print("Campaign %s correctly deleted in ReqMgr2: %s" % (old_campaign, res))

    ## merge all anyways
    CI.update(add_on, c_type='relval')
def closor(url, specific=None, options=None):
    """
    Close out the workflows that are ready for it and send the e-mail of
    the release validation batches reported as ready.

    The workflows are read from the database session (status 'close', or
    status containing 'announce' but not 'announced' when options.announce
    is set). One CloseBuster per workflow is run through a ThreadHandler;
    afterwards the outputs and status changes collected by the threads are
    written back to the session.

    :param url: passed to getWorkflows and to every CloseBuster
    :param specific: optional substring; only workflows whose name contains
                     it are considered
    :param options: optional object with the attributes manual, announce,
                    limit and threads; when not given, manual and announce
                    count as False and no limit is applied
    :return: None
    """
    if userLock():
        return
    mlock = moduleLock()
    ## options is optional: every access to it is guarded the same way
    manual = options.manual if options else False
    limit = options.limit if options else None
    if mlock() and not manual:
        return
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check():
        return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    BI = batchInfo()
    CloseI = closeoutInfo()

    all_late_files = []

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print("announce option is on. Checking on things on-going ready to be announced")
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print("regular option. Checking on things done and to be announced")
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    if specific:
        wfs = [wfo for wfo in wfs if specific in wfo.name]
    wfs_n = [w.name for w in wfs]

    print("unique names?")
    print(len(set(wfs_n)) == len(wfs_n))

    held = set()

    print("%s closing" % (len(wfs),))
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if limit:
        max_per_round = limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        ## position of each request name in the priority ordering; the first
        ## occurrence wins and unknown names rank 0, exactly as a list.index
        ## lookup with a 0 fallback would give
        position = {}
        for idx, request in enumerate(all_closedout):
            position.setdefault(request['RequestName'], idx)
        wfs = sorted(wfs, key=lambda wfo: position.get(wfo.name, 0), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_extreme_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    ## these workflows are left alone unless options.manual is set
    skipped_unless_manual = (
        'cmsunified_task_HIG-RunIIFall17wmLHEGS-05036__v1_T_200712_005621_4159',
        'pdmvserv_task_HIG-RunIISummer16NanoAODv7-03979__v1_T_200915_013748_1986',
    )

    closers = []

    print("%s closing" % (len(wfs),))
    th_start = time.mktime(time.gmtime())

    for wfo in wfs:
        if specific and not specific in wfo.name:
            continue
        if not manual and any(name.lower() in wfo.name.lower() for name in skipped_unless_manual):
            continue
        closers.append(
            CloseBuster(
                wfo=wfo,
                url=url,
                CI=CI,
                UC=UC,
                jump_the_line=jump_the_line,
                batch_goodness=batch_goodness,
                batch_go=batch_go,
                batch_warnings=batch_warnings,
                batch_extreme_warnings=batch_extreme_warnings,
                all_late_files=all_late_files,
                held=held,
            ))

    thread_settings = dict(threads=closers,
                           sleepy=10,
                           timeout=None,
                           verbose=True,
                           label='closor')
    if options:
        thread_settings['n_threads'] = options.threads
    ## NOTE(review): without options, n_threads is left to ThreadHandler's
    ## own default - confirm that it has one
    run_threads = ThreadHandler(**thread_settings)

    run_threads.start()

    ## waiting on all to complete
    while run_threads.is_alive():
        time.sleep(5)

    JC = JIRAClient() if up.status.get('jira', False) else None
    print("%s finished thread to gather information from" % (len(run_threads.threads),))
    failed_threads = 0
    for to in run_threads.threads:
        if to.failed:
            failed_threads += 1
            continue
        if to.outs:
            for outO in to.outs:
                out = outO.datasetname
                odb = session.query(Output).filter(Output.datasetname == out).first()
                if not odb:
                    print("adding an output object %s" % (out,))
                    session.add(outO)
                else:
                    odb.date = outO.date

        if to.to_status:
            to.wfo.status = to.to_status
            if JC and to.to_status == "done" and to.wfi:
                jiras = JC.find({"prepid": to.wfi.request['PrepID']})
                for jira in jiras:
                    JC.close(jira.key)

        if to.to_wm_status:
            to.wfo.wm_status = to.to_wm_status
        if to.closing:
            CloseI.pop(to.wfo.name)

        ## NOTE(review): committed once per gathered thread - confirm this is
        ## the intended nesting of the commit
        session.commit()

    th_stop = time.mktime(time.gmtime())

    if wfs:
        time_spend_per_workflow = (th_stop - th_start) / float(len(wfs))
        print("Average time spend per workflow is %s" % (time_spend_per_workflow,))

    ## any failed thread is worth an alert; an integer ratio of the two
    ## counters would round down to zero and hide partial failures
    if failed_threads:
        failure_report = '%d/%d threads have failed, better check this out' % (
            failed_threads, run_threads.n_threads)
        ## NOTE(review): reported under 'checkor' although this is closor - confirm
        sendLog('checkor', failure_report, level='critical')
        sendEmail('checkor', failure_report)

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files
        if info['retries'] >= retries_late and info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print(subject)
        ## NOTE(review): the file is only rewritten when something is stuck - confirm
        with open('%s/stuck_files.json' % monitor_dir, 'w') as stuck_file:
            stuck_file.write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" % ("\n".join(sorted(held))),
                level='critical')

    for bname, go in batch_go.items():
        if not go:
            continue
        subject = "Release Validation Samples Batch %s" % bname
        issues = ""
        if batch_extreme_warnings[bname]:
            subject = "Low Statistics for %s" % bname
            ## plain string, never %-formatted: a single % sign is what must be shown
            issues = "The following datasets have outstanding completion (<50%) issues:\n\n"
            issues += "\n".join(sorted(batch_extreme_warnings[bname]))
            issues += "\n\n"
        elif batch_warnings[bname]:
            issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
            issues += "\n".join(sorted(batch_warnings[bname]))
            issues += "\n\n"
        text = "".join([
            "Dear all,\n\n",
            "A batch of release validation workflows has finished.\n\n",
            "Batch ID:\n\n",
            "%s\n\n" % (bname),
            "Detail of the workflows\n\n",
            "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s\n\n" % (bname),
            "%s\n\n" % (issues),
            "This is an automated message.\n\n",
        ])
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        ## just announced ; take it out now.
        BI.pop(bname)
        deleteCampaignConfig(bname)

    if os.path.isfile('.closor_stop'):
        print("The loop on workflows was shortened")
        sendEmail('closor', 'Closor loop was shortened artificially using .closor_stop')
        try:
            os.remove('.closor_stop')
        except OSError:
            ## best effort, like "rm -f": the marker may already be gone
            pass
def _parse_parameter(parameter):
    """
    Turn a "name:value" option into the dictionary to apply on a campaign.

    The value is converted to an int, else a float, else a JSON object, and
    is kept as a string when none of these applies. A dotted name
    ("a.b.c") produces nested dictionaries, one level per component.

    :param parameter: string of the form "name:value"
    :return: dictionary holding the single (possibly nested) entry
    """
    name, value = parameter.split(':', 1)
    ## convert to int or float or object
    for convert in (int, float, json.loads):
        try:
            value = convert(value)
            break
        except ValueError:
            # try the next conversion; stays a string if all of them fail
            continue

    update = {}
    if '.' in name:
        path = list(name.split('.'))
        w = update
        for p in path[:-1]:
            w[p] = {}
            w = w[p]
        w[path[-1]] = value
    else:
        update[name] = value
    return update


def main():
    """
    Execute the whole logic for campaign configuration management

    Depending on the parsed options this either:
      - loads a JSON file of campaigns into the database (options.load),
      - dumps the campaigns of a given type to a JSON file (options.dump),
      - removes one campaign (options.remove with options.name),
      - replaces, updates, creates or prints the campaign options.name.
    The load, dump and remove modes end with sys.exit(0).
    """
    options = parseArgs()

    client = mongo_client()
    db = client.unified.campaignsConfiguration

    if options.load:
        campaigns = []
        with open(options.load) as load_file:
            content = json.load(load_file)
        for k, v in content.items():
            up = {'name': k}
            ## replace the db content
            v['name'] = k
            if options.type:
                v['type'] = options.type
            db.replace_one(up, v)
            campaigns.append(v)
            print("%s %s" % (k, v))
        replaceCampaigns(campaigns)
        sys.exit(0)

    if options.dump:
        uc = {}
        for content in db.find():
            i = content.pop("_id")
            if content.get('type', None) != options.type:
                continue  ## no relval
            if 'name' not in content:
                db.delete_one({'_id': i})
                print("dropping %s %s because it is malformated" % (i, content))
                continue
            uc[content.pop("name")] = content
        print("%s campaigns damp" % (len(uc.keys()),))
        with open(options.dump, 'w') as dump_file:
            dump_file.write(json.dumps(uc, indent=2, sort_keys=True))
        sys.exit(0)

    if options.remove:
        if options.name:
            db.delete_one({'name': options.name})
            # and delete it in central couch too
            deleteCampaignConfig(options.name)
        else:
            pass
        sys.exit(0)

    post = {}
    if options.configuration:
        try:
            post.update(json.loads(options.configuration))
        except ValueError:
            ## not an inline JSON document: read it as the path of a JSON file
            with open(options.configuration) as configuration_file:
                post.update(json.load(configuration_file))
        post['name'] = options.name

    update = _parse_parameter(options.parameter) if options.parameter else {}

    found = db.find_one({"name": options.name})
    if found:
        up = {'_id': found['_id']}
        if post:
            print("replacing %s with values %s" % (options.name, post))
            if options.type:
                post['type'] = options.type
            db.replace_one(up, post)
            ### Alan: can I assume options.name and options.configuration
            # contain the same campaign configuration?!?!
            replaceCampaigns(post)
        elif update:
            ## need to update a value
            if options.type:
                update['type'] = options.type
            print("updating %s with values %s" % (options.name, update))
            db.update_one(up, {"$set": update})
            ### And update it in central CouchDB as well
            thisDoc = deepcopy(found)
            thisDoc.update(update)
            replaceCampaigns(thisDoc)
        else:
            ## use that to show the value in the database
            # not other headers in the output, so that it can be json loadable
            found.pop('name')
            found.pop('_id')
            print(json.dumps(found, indent=2, sort_keys=True))
    else:
        if post:
            ## entering a new value
            if options.type:
                post['type'] = options.type
            post.update({"name": options.name})
            db.insert_one(post)
            createCampaign(post)
        elif update:
            if options.type:
                update['type'] = options.type
            update.update({"name": options.name})
            db.insert_one(update)
            ## create it from the document just inserted; post is empty here
            createCampaign(update)
        else:
            availables = [o["name"] for o in db.find()]
            print("%s  Not found.  %s available campaigns \n%s" % (
                options.name, len(availables), "\n\t".join(sorted(availables))))