def injector(url, options, specific):
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    UC = unifiedConfiguration()
    transform_keywords = UC.get('convert_to_stepchain')

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for user in UC.get("user_rereco"):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco"))
    for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain"))
    for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults"))

    print len(workflows), "in line"

    cannot_inject = set()
    to_convert = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, compare with ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            ## check first that there isn't something related here already, with a valid status
            can_add = True
            ## first try at finding a match
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))
                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in ['forget', 'trouble', 'forget-unlock', 'forget-out-unlock']:
                        wfi.sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status))
                        sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status), level='critical')
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False

            ## add a check on the validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector', "One of the inputs is not VALID. %s : %s" % (d, status_cache[d]))
                    sendLog('injector', "One of the inputs of %s is not VALID. %s : %s" % (wf, d, status_cache[d]), level='critical')
                    can_add = False
                ## check for any file in phedex, to verify existence
                _, ph_files, _, _ = getDatasetFiles(url, d)
                if not ph_files and not ('StoreResults' == wfi.request.setdefault('RequestType', None)):
                    wfi.sendLog('injector', "One of the inputs has no file in phedex: %s" % d)
                    sendLog('injector', "One of the inputs has no file in phedex: %s" % d, level='critical')
                    can_add = False

            ### ban some workflows that you don't like anymore
            #outputs = wfi.request['OutputDatasets']

            if not can_add: continue

            ## temporary hack to transform specific taskchains into stepchains
            #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords)
            good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=None)
            ## match keywords and technical constraints
            #if (not options.no_convert) and good_for_stepchain and not wfi.isRelval():
            #    to_convert.add( wf )
            #    wfi.sendLog('injector', 'Transforming %s TaskChain into StepChain' % wf)
            #    #sendEmail('conversion to stepchain', 'Transforming %s TaskChain into StepChain' % wf)

            wfi.sendLog('injector', "considering %s" % wf)

            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog('injector', 'These workflows cannot be added in because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning')

    for wf in to_convert:
        os.system('./Unified/rejector.py --clone --to_step --comments "Transform to StepChain" %s' % wf)

    ## pass a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):  ## the "or True" forces this on regardless of the flag
        invalidator(url)

    no_replacement = set()
    #print "getting all transfers"
    #all_transfers=session.query(Transfer).all()
    #print "go!"

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in ['rejected', 'rejected-archived', 'aborted', 'aborted-archived']: continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            if wfi.isRelval():
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget')
                wf.status = 'forget'
                session.commit()
            else:
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement')
                no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog('injector', 'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "family members"
        print wf.name, "has", len(true_familly), "true family members"

        ## we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector', "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog('injector', "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            ## get all transfers working for the old workflow
            for tr in session.query(TransferImp).filter(TransferImp.workflow_id == wf.id).all():
                existing = session.query(TransferImp).filter(
                    TransferImp.phedexid == tr.phedexid).filter(
                        TransferImp.workflow_id == new_wf.id).all()
                tr.active = False  ## disable the old one
                if not existing:
                    ## create the transfer object for the new dependency
                    tri = TransferImp(phedexid=tr.phedexid, workflow=new_wf)
                    session.add(tri)
                session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()

    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector', 'workflows with no replacement\n%s \nare dangling there' % ('\n'.join(no_replacement)), level='critical')
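## A minimal sketch (not in the original module) of how a function like
## injector() above is typically driven from the command line. The option
## names mirror the attributes read in the body (wmstatus, user, setstatus,
## user_relval, user_storeresults, invalidate, replace, no_convert); the
## parser, the defaults, and the url value are assumptions.
if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('--wmstatus', default='assignment-approved')
    parser.add_option('--user', default='pdmvserv')
    parser.add_option('--setstatus', default='considered')
    parser.add_option('--user_relval', default=None)
    parser.add_option('--user_storeresults', default=None)
    parser.add_option('--invalidate', action='store_true', default=False)
    parser.add_option('--no_convert', action='store_true', default=False)
    parser.add_option('--replace', default=None)
    (options, args) = parser.parse_args()
    specific = args[0] if args else None
    url = 'cmsweb.cern.ch'  ## assumed host; the real module defines url elsewhere
    injector(url, options, specific)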
## fragment from the secondary-input lock scan; names such as usors,
## secondary_timeout, dataset, now, unlock, bad_ds and newly_locking come
## from the surrounding (not shown) loop
for usor in usors:
    d = time.mktime(time.strptime("-".join(map(str, usor['RequestDate'])), "%Y-%m-%d-%H-%M-%S"))
    secondary_timeout[dataset] = max(secondary_timeout[dataset], d)
if secondary_timeout[dataset]:  ## different from zero
    delay_days = 30
    delay = delay_days * 24 * 60 * 60  # 30 days
    if (now - secondary_timeout[dataset]) > delay:
        print "unlocking secondary input after", delay_days, "days"
        unlock = True
tier = dataset.split('/')[-1]
creators = getWorkflowByOutput(url, dataset, details=True)
if not creators and not tier == 'RAW':
    ds_status = getDatasetStatus(dataset)
    if not '-v0/' in dataset and ds_status != None:
        sendEmail('failing get by output', '%s has not been produced by anything?' % dataset)
        newly_locking.add(dataset)
        continue
    else:
        # does not matter, cannot be an OK dataset
        unlock = True
        bad_ds = True
creators_status = [r['RequestStatus'] for r in creators]
print "Statuses of workflows that made the dataset", dataset, "are", creators_status
if all([status in ['failed', 'aborted', 'rejected', 'aborted-archived', 'rejected-archived']
        for status in creators_status]):
    ## crap
    print "\tunlocking", dataset, "for bad workflow statuses"
    unlock = True
    bad_ds = True
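## Self-contained illustration (not in the original) of the timestamp handling
## above: ReqMgr's 'RequestDate' is a list of integers which is joined into a
## "Y-m-d-H-M-S" string and parsed back into an epoch so that request ages can
## be compared. The value below is hypothetical.
import time

request_date = [2018, 5, 14, 9, 30, 0]
epoch = time.mktime(time.strptime("-".join(map(str, request_date)), "%Y-%m-%d-%H-%M-%S"))
print epoch, "seconds since the epoch,", (time.time() - epoch) / 86400., "days ago"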
def outcleanor(url, options):
    if options.approve:
        for user in ['*Vlimant']:  #,'*Cremonesi']:
            deletes = listDelete(url, user=user)
            for (site, who, tid) in deletes:
                if 'MSS' in site: continue  ### ever
                print site, who, tid
                print "approving deletion"
                print approveSubscription(url, tid, nodes=[site], comments='Production cleaning by data ops')
        return

    sites_and_datasets = defaultdict(list)
    our_copies = defaultdict(list)
    wf_cleaned = {}
    wfs = []
    for fetch in options.fetch.split(','):
        wfs.extend(session.query(Workflow).filter(Workflow.status == fetch).all())

    random.shuffle(wfs)
    global last_answer  ## ask() below writes the module-level name; read it consistently here
    last_answer = None
    for wfo in wfs:
        if options.number and len(wf_cleaned) >= options.number:
            print "Reached", options.number, "cleaned"
            break
        print '-' * 100
        wfi = workflowInfo(url, wfo.name)
        goes = {}  # boolean per output
        for dataset in wfi.request['OutputDatasets']:
            goes[dataset] = False
            keep_one_out = True
            status = getDatasetStatus(dataset)
            print "\n\tLooking at", dataset, status, "\n"
            vetoes = None
            if status == 'INVALID':
                vetoes = ['Export', 'Buffer']  ## can take themselves out
                keep_one_out = False  # just wipe clean
            elif status == None:
                print dataset, "actually does not exist. skip"
                goes[dataset] = True
                continue
            elif status in ['PRODUCTION', 'VALID'] and wfo.status in ['forget', 'trouble']:
                print dataset, "should probably be invalidated. (", wfo.status, ") skip"
                keep_one_out = False  # just wipe clean
                continue  ## you are not sure. just skip it for the time being
            elif status == 'PRODUCTION' and wfo.status in ['clean']:
                print dataset, "should probably be set valid. skip"
                continue  ## you are not sure. just skip it for the time being

            if status == 'VALID' and dataset.startswith('/MinBias'):
                print "This is a /MinBias. skip"
                continue

            if '/DQM' in dataset:
                keep_one_out = False

            total_size = getDatasetSize(dataset)
            our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps", vetoes=vetoes)
            also_our_presence = getDatasetPresence(url, dataset, complete=None, group="", vetoes=vetoes)

            ## merge into one unique dict
            for site in also_our_presence:
                if site in our_presence:
                    there, frac = our_presence[site]
                    other, ofrac = also_our_presence[site]
                    our_presence[site] = (max(there, other), max(frac, ofrac))
                else:
                    our_presence[site] = also_our_presence[site]

            if our_presence: print our_presence

            ## analysis ops copies need to be taken into account
            anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps")
            own_by_anaops = anaops_presence.keys()

            ## all our copies
            to_be_cleaned = our_presence.keys()
            if not len(to_be_cleaned):
                print "none of ours to be found,", len(own_by_anaops), "in analysis ops pool"
                goes[dataset] = True
                continue

            print "Where we own bits of dataset"
            print to_be_cleaned

            if len(own_by_anaops):
                ## remove sites with the anaops copies
                to_be_cleaned = list(set(to_be_cleaned) - set(own_by_anaops))
                keep_one_out = False  ## in that case, just remove our copies
                print "Owned by anaops (therefore do not keep a copy of ours)"
                print own_by_anaops
            else:
                ## we should not be looking at anything that was not passed to DDM, otherwise we'll be cutting the grass under our feet
                using_the_same = getWorkflowByInput(url, dataset, details=True)
                conflict = False
                for other in using_the_same:
                    if other['RequestName'] == wfo.name: continue
                    if other['RequestType'] == 'Resubmission': continue
                    if not other['RequestStatus'] in ['announced', 'normal-archived', 'aborted', 'rejected',
                                                      'aborted-archived', 'rejected-archived', 'closed-out',
                                                      'None', None]:
                        print other['RequestName'], 'is in status', other['RequestStatus'], 'preventing from cleaning', dataset
                        conflict = True
                        break
                if conflict:
                    continue

            ## not being used. a bit less dangerous to clean out
            ## keep one full copy out there
            full_copies = [site for (site, (there, fract)) in our_presence.items() if there]
            if keep_one_out:
                if not len(full_copies):
                    print "we do not own a full copy of", dataset, status, wfo.status, ". skip"
                    continue
                stay_there = random.choice(full_copies)  # at a place owned by ops
                print "Where we keep a full copy", stay_there
                to_be_cleaned.remove(stay_there)
                our_copies[stay_there].append(dataset)
            else:
                print "We do not want to keep a copy of ", dataset, status, wfo.status

            if len(to_be_cleaned):
                print "Where we can clean"
                print to_be_cleaned
                for site in to_be_cleaned:
                    sites_and_datasets[site].append((dataset, total_size * our_presence[site][1] / 100., status))
                goes[dataset] = True
            else:
                print "no cleaning to be done"
                goes[dataset] = True

        print wfo.name, "scrutinized"
        if all(goes.values()):
            print "\t", wfo.name, "can toggle -out"

        def ask():
            global last_answer
            last_answer = raw_input('go on ?')
            return last_answer

        if options.auto or ask() in ['y', '']:
            if all(goes.values()):
                wfo.status = wfo.status + '-out'
                wf_cleaned[wfo.name] = wfo.status
            continue
        elif last_answer in ['q', 'n']:
            break
        else:
            return

    if options.auto:
        pass
    elif last_answer in ['q']:
        return

    print "Potential cleanups"
    for (site, items) in sites_and_datasets.items():
        cleanup = sum([size for (_, size, _) in items])
        print "\n\t potential cleanup of", "%8.4f" % cleanup, "GB at ", site
        print "\n".join([ds + " " + st for ds, _, st in items])
        datasets = [ds for ds, _, st in items]

    print "Copies and bits we are going to delete"
    print json.dumps(sites_and_datasets, indent=2)
    print "Copies we are keeping"
    print json.dumps(our_copies, indent=2)
    print "Workflows cleaned for output"
    print json.dumps(wf_cleaned, indent=2)
    stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    open('outcleaning_%s.json' % stamp, 'w').write(json.dumps(sites_and_datasets, indent=2))
    open('keepcopies_%s.json' % stamp, 'w').write(json.dumps(our_copies, indent=2))
    open('wfcleanout_%s.json' % stamp, 'w').write(json.dumps(wf_cleaned, indent=2))

    if (not options.test) and (options.auto or raw_input(
            "Satisfied ? (y will trigger status change and deletion requests)") in ['y']):
        for (site, items) in sites_and_datasets.items():
            datasets = [ds for ds, _, st in items]
            print "making deletion to", site
            result = makeDeleteRequest(url, site, datasets,
                                       "Cleanup output after production. DataOps will take care of approving it.")
            print result
            ## approve it right away ?
            if 'MSS' in site: continue
            if 'Export' in site: continue
            if 'Buffer' in site: continue
            for did in [item['id'] for item in result['phedex']['request_created']]:
                print "auto-approve disabled, but ready"
                #approveSubscription(url, did, nodes = [site], comments = 'Auto-approving production cleaning deletion')
                pass
        session.commit()
    else:
        print "Not making the deletion and changing statuses"
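## Self-contained illustration (not in the original) of the presence merge in
## outcleanor above: per site, keep the most complete copy flag and the largest
## fraction seen under either group. Site names and numbers are made up.
our_presence = {'T1_US_FNAL_Disk': (True, 100.0), 'T2_DE_DESY': (False, 40.0)}
also_our_presence = {'T2_DE_DESY': (True, 60.0), 'T2_CH_CERN': (False, 10.0)}
for site in also_our_presence:
    if site in our_presence:
        there, frac = our_presence[site]
        other, ofrac = also_our_presence[site]
        our_presence[site] = (max(there, other), max(frac, ofrac))
    else:
        our_presence[site] = also_our_presence[site]
print our_presence  ## T2_DE_DESY ends up as (True, 60.0)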
def injector(url, options, specific):
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    UC = unifiedConfiguration()
    transform_keywords = UC.get('convert_to_stepchain')

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for user in UC.get("user_rereco"):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco"))
    for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain"))
    for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults"))

    print len(workflows), "in line"

    cannot_inject = set()
    to_convert = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, compare with ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            ## check first that there isn't something related here already, with a valid status
            can_add = True
            ## first try at finding a match
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))
                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in ['forget', 'trouble', 'forget-unlock', 'forget-out-unlock']:
                        wfi.sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status))
                        sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status), level='critical')
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False

            ## add a check on the validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector', "One of the inputs is not VALID. %s : %s" % (d, status_cache[d]))
                    sendLog('injector', "One of the inputs of %s is not VALID. %s : %s" % (wf, d, status_cache[d]), level='critical')
                    can_add = False
                #else:
                #    ##make sure that all blocks get closed
                #    closeAllBlocks(url, d)
                ## check for any file in phedex, to verify existence
                _, ph_files, _, _ = getDatasetFiles(url, d)
                if not ph_files and not ('StoreResults' == wfi.request.setdefault('RequestType', None)):
                    wfi.sendLog('injector', "One of the inputs has no file in phedex: %s" % d)
                    sendLog('injector', "One of the inputs has no file in phedex: %s" % d, level='critical')
                    can_add = False

            ### ban some workflows that you don't like anymore
            #outputs = wfi.request['OutputDatasets']

            if not can_add: continue

            ## temporary hack to transform specific taskchains into stepchains
            good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=transform_keywords)
            #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = None)
            ## match keywords and technical constraints
            if (not options.no_convert) and good_for_stepchain and not wfi.isRelval():
                to_convert.add(wf)
                wfi.sendLog('injector', 'Transforming %s TaskChain into StepChain' % wf)
                sendEmail('conversion to stepchain', 'Transforming %s TaskChain into StepChain' % wf)

            wfi.sendLog('injector', "considering %s" % wf)

            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog('injector', 'These workflows cannot be added in because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='critical')

    for wf in to_convert:
        os.system('./Unified/rejector.py --clone --to_step --comments "Transform to StepChain" %s' % wf)

    ## pass a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):  ## the "or True" forces this on regardless of the flag
        invalidator(url)

    no_replacement = set()
    #print "getting all transfers"
    #all_transfers=session.query(Transfer).all()
    #print "go!"

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in ['rejected', 'rejected-archived', 'aborted', 'aborted-archived']: continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            if wfi.isRelval():
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget')
                wf.status = 'forget'
                session.commit()
            else:
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement')
                no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog('injector', 'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "family members"
        print wf.name, "has", len(true_familly), "true family members"

        ## we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector', "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog('injector', "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            ## get all transfers working for the old workflow
            for tr in session.query(TransferImp).filter(TransferImp.workflow_id == wf.id).all():
                existing = session.query(TransferImp).filter(
                    TransferImp.phedexid == tr.phedexid).filter(
                        TransferImp.workflow_id == new_wf.id).all()
                tr.active = False  ## disable the old one
                if not existing:
                    ## create the transfer object for the new dependency
                    tri = TransferImp(phedexid=tr.phedexid, workflow=new_wf)
                    session.add(tri)
                session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()

    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector', 'workflows with no replacement\n%s \nare dangling there' % ('\n'.join(no_replacement)), level='critical')
def run(self):
    site = self.site
    print "checking on site", site
    si = self.SI
    UC = self.UC
    RDI = self.RDI
    options = self.options
    locks = self.locks
    waiting = self.waiting
    stuck = self.stuck
    missing = self.missing
    remainings = {}

    ds = si.getRemainingDatasets(si.CE_to_SE(site))
    #print len(ds)
    taken_size = 0.
    sum_waiting = 0.
    sum_stuck = 0.
    sum_missing = 0.
    sum_unlocked = 0.
    n_ds = options.ndatasets
    i_ds = 0
    ds_threads = []
    for i_ds, (size, dataset) in enumerate(ds):
        if n_ds and i_ds >= n_ds: break
        remainings[dataset] = {"size": size, "reasons": []}
        #print "-"*10
        if not dataset in locks:
            #print dataset,"is not locked"
            sum_unlocked += size
            remainings[dataset]["reasons"].append('unlock')
        else:
            remainings[dataset]["reasons"].append('lock')
        if dataset in waiting:
            #print dataset,"is waiting for custodial"
            sum_waiting += size
            remainings[dataset]["reasons"].append('tape')
        if dataset in stuck:
            sum_stuck += size
            remainings[dataset]["reasons"].append('stuck-tape')
        if dataset in missing:
            sum_missing += size
            remainings[dataset]["reasons"].append('missing-tape')
        ds_threads.append(DatasetCheckBuster(dataset=dataset, url=url))

    run_threads = ThreadHandler(threads=ds_threads,
                                label='%s Dataset Threads' % site,
                                n_threads=10,
                                start_wait=0,
                                timeout=None,
                                verbose=True)
    ## start and sync
    run_threads.run()
    #run_threads.start()
    #while run_threads.is_alive():
    #    time.sleep(10)

    for t in run_threads.threads:
        remainings[t.dataset]["reasons"].extend(t.reasons)
        remainings[t.dataset]["reasons"].sort()
        print t.dataset, remainings[t.dataset]["reasons"]

    #print "\t",sum_waiting,"[GB] could be freed by custodial"
    print "\t", sum_unlocked, "[GB] is not locked by unified"

    print "updating database with remaining datasets"
    RDI.set(site, remainings)
    try:
        eosFile('%s/remaining_%s.json' % (monitor_dir, site), 'w').write(json.dumps(remainings, indent=2)).close()
    except:
        pass

    ld = remainings.items()
    ld.sort(key=lambda i: i[1]['size'], reverse=True)
    table = "<html>Updated %s GMT, <a href=remaining_%s.json>json data</a><br>" % (time.asctime(time.gmtime()), site)

    accumulate = defaultdict(lambda: defaultdict(float))
    for item in remainings:
        tier = item.split('/')[-1]
        for reason in remainings[item]['reasons']:
            accumulate[reason][tier] += remainings[item]['size']

    table += "<table border=1><thead><tr><th>Reason</th><th>size [TB]</th></tr></thead>"
    for reason in accumulate:
        s = 0
        table += "<tr><td>%s</td><td><ul>" % reason
        subitems = accumulate[reason].items()
        subitems.sort(key=lambda i: i[1], reverse=True)
        for tier, ss in subitems:
            table += "<li> %s : %10.3f</li>" % (tier, ss / 1024.)
            s += ss / 1024.
        table += "</ul>total : %.3f</td>" % s
    table += "</table>\n"

    table += "<table border=1><thead><tr><th>Dataset</th><th>Size [GB]</th><th>Label</th></tr></thead>\n"
    only_unlock = set()
    for item in ld:
        ds_name = item[0]
        reasons = item[1]['reasons']
        sub_url = '<a href="https://cmsweb.cern.ch/das/request?input=%s">%s</a>' % (ds_name, ds_name)
        if 'unlock' in reasons:
            sub_url += ', <a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?block=%s%%23*&node=%s">block</a>' % (ds_name, site)
        if 'unlock' in reasons or 'input' in reasons:
            sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?inputdataset=%s&mask=RequestName&mask=RequestStatus">input</a>' % (ds_name)
        if 'unlock' in reasons or 'output' in reasons:
            sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s&mask=RequestName&mask=RequestStatus">output</a>' % (ds_name)
        if 'pilup' in reasons:
            sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?mc_pileup=%s&mask=RequestName&mask=RequestStatus">secondary</a>' % (ds_name)
        table += "<tr><td>%s</td><td>%d</td><td><ul>%s</ul></td></tr>\n" % (sub_url, item[1]['size'], "<li>".join([""] + reasons))
        if reasons == ['unlock']:
            only_unlock.add(item[0])
    table += "</table></html>"
    eosFile('%s/remaining_%s.html' % (monitor_dir, site), 'w').write(table).close()

    print "checking on unlock only datasets"
    to_ddm = UC.get('tiers_to_DDM')
    #look_at = list(only_unlock)
    look_at = list(only_unlock)[:20]
    #look_at = list([ds for ds in only_unlock if not ds.endswith('NANOAODSIM')])
    for item in look_at:
        tier = item.split('/')[-1]
        ds_status = getDatasetStatus(item)
        print item, ds_status
        if ds_status == 'PRODUCTION':
            print item, "is found", ds_status, "and unlocked on", site
            if options.invalidate_anything_left_production_once_unlocked:
                print "Setting status to invalid for", item
                setDatasetStatus(item, 'INVALID')
        if tier in to_ddm:
            print item, "looks like analysis and still dataops on", site
            if options.change_dataops_subs_to_anaops_once_unlocked:
                print "Sending", item, "to anaops"
                allCompleteToAnaOps(url, item)
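## Self-contained sketch (not in the original) of the fan-out/collect pattern
## that run() uses through DatasetCheckBuster and ThreadHandler, rewritten with
## the standard library only. Class and dataset names here are hypothetical.
import threading

class DatasetProbe(threading.Thread):
    ## stands in for DatasetCheckBuster: one probe per dataset, collecting reasons
    def __init__(self, dataset):
        threading.Thread.__init__(self)
        self.dataset = dataset
        self.reasons = []

    def run(self):
        ## placeholder for the real per-dataset checks
        self.reasons.append('checked')

probes = [DatasetProbe(ds) for ds in ['/A/B/RECO', '/C/D/AODSIM']]
for p in probes: p.start()
for p in probes: p.join()
for p in probes:
    print p.dataset, p.reasons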
def injector(url, options, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    ## regardless of users, pick up all ReReco on the table
    workflows.extend(getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco"))

    print len(workflows), "in line"

    cannot_inject = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, compare with ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            #wl = getWorkLoad(url, wf)
            ## check first that there isn't something related here already, with a valid status
            can_add = True
            ## first try at finding a match
            # print wfi.request
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                #req_familly = getWorkflowById( url, wl['PrepID'])
                #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly]
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))
                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in ['forget', 'trouble', 'forget-unlock', 'forget-out-unlock']:
                        sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status))
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False

            ## add a check on the validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector', "One of the inputs is not VALID. %s : %s" % (d, status_cache[d]))
                    sendLog('injector', "One of the inputs of %s is not VALID. %s : %s" % (wf, d, status_cache[d]))
                    can_add = False

            if not can_add: continue

            wfi.sendLog('injector', "considering %s" % wf)
            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog('injector', 'These workflows cannot be added in because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning')

    ## pass a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):  ## the "or True" forces this on regardless of the flag
        invalidator(url)

    no_replacement = set()
    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in ['rejected', 'rejected-archived', 'aborted', 'aborted-archived']: continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            wfi.sendLog('injector', 'the workflow was found in trouble with no replacement')
            no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog('injector', 'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "family members"
        print wf.name, "has", len(true_familly), "true family members"

        ## we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector', "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog('injector', "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            ## re-point transfers from the old workflow to its replacement
            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove(wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid, "got", new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid < 0:
                        ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()

    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector', 'workflows with no replacement, %s \n are dangling there' % ('\n'.join(no_replacement)), level='critical')
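## The deepcopy-then-reassign on tr.workflows_id above is likely needed because
## SQLAlchemy only marks a column dirty when the attribute is rebound, not when
## a mutable value is edited in place (absent a mutation-tracking extension).
## A self-contained sketch of the swap itself, with hypothetical ids:
workflows_id = [4, 8, 15]   # hypothetical transfer-to-workflow mapping
old_id, new_id = 8, 16
sw = list(workflows_id)     # copy first (copy.deepcopy in the code above)
sw.remove(old_id)
sw.append(new_id)
workflows_id = sw           # rebind rather than mutate in place
print workflows_id          # [4, 15, 16]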
## fragment: a later variant of the secondary-input lock scan above; the
## surrounding per-dataset loop is not shown
if secondary_timeout[dataset]:  ## different from zero
    delay_days = 30
    delay = delay_days * 24 * 60 * 60  # 30 days
    if (now - secondary_timeout[dataset]) > delay:
        print "unlocking secondary input after", delay_days, "days"
        unlock = True
    else:
        print "keep a lock on secondary within", delay_days, "days"
        unlock = False
        newly_locking.add(dataset)
        continue

tier = dataset.split("/")[-1]
creators = getWorkflowByOutput(url, dataset, details=True)
if not creators and not tier == "RAW" and not "-PromptReco-" in dataset:
    ds_status = getDatasetStatus(dataset)
    if not "-v0/" in dataset and ds_status != None:
        # sendEmail('failing get by output','%s has not been produced by anything?'%dataset)
        sendLog("lockor",
                "failing get by output, %s has not been produced by anything?" % dataset,
                level="critical")
        newly_locking.add(dataset)
        continue
    else:
        # does not matter, cannot be an OK dataset
        unlock = True
        bad_ds = True

creators_status = [r["RequestStatus"] for r in creators]
print "Statuses of workflows that made the dataset", dataset, "are", creators_status
if len(creators_status) and all([
        ## status list as in the earlier variant of this check
        status in ['failed', 'aborted', 'rejected', 'aborted-archived', 'rejected-archived']
        for status in creators_status]):
    print "\tunlocking", dataset, "for bad workflow statuses"
    unlock = True
    bad_ds = True