Esempio n. 1
0
def parse_those(url, options=None, those=None):
    """Run one ParseBuster per entry of ``those`` and dump the merged results.

    The workers are executed through a ``ThreadHandler``. Once they have all
    finished, their ``task_error`` dictionaries are merged and written to
    ``all_errors.json`` and their ``one_explanation`` sets are merged and
    written to ``explanations.json`` (both under ``monitor_dir``). Finally,
    for every error code, the sorted list of names taken from the second
    '/'-separated field of the task keys is printed.

    Args:
        url: forwarded to every ``ParseBuster``.
        options: forwarded to every ``ParseBuster``; its ``threads`` attribute
            sets the number of concurrent workers. When None, 5 workers are
            used.
        those: iterable of items handed to ``ParseBuster`` as ``wfn``
            (presumably workflow names -- confirm with the caller). None is
            treated as an empty list.
    """
    # A list default would be one object shared by every call, hence the
    # None sentinel.
    if those is None:
        those = []

    threads = [ParseBuster(url=url, wfn=wfn, options=options) for wfn in those]

    run_threads = ThreadHandler(
        threads=threads,
        # ``options`` is optional in the signature: fall back to 5 workers
        # instead of failing on ``None.threads``.
        n_threads=options.threads if options is not None else 5,
        sleepy=10,
        timeout=None,
        verbose=True)

    print("running all workers")
    run_threads.start()
    while run_threads.is_alive():
        time.sleep(10)

    print("all threads completed")
    alls = {}
    explanations = defaultdict(set)
    for worker in threads:
        alls.update(worker.task_error)
        one_explanation = worker.one_explanation
        for code in one_explanation:
            explanations[code].update(one_explanation[code])

    eosFile('%s/all_errors.json' % monitor_dir,
            'w').write(json.dumps(alls, indent=2)).close()

    # Sets are not JSON serialisable: turn them into lists before dumping.
    explanations = dict(
        (code, list(reasons)) for code, reasons in explanations.items())
    eosFile('%s/explanations.json' % monitor_dir,
            'w').write(json.dumps(explanations, indent=2)).close()

    # Group, per error code, the second '/'-separated field of each task key.
    per_code = defaultdict(set)
    for task in alls:
        for code in alls[task]:
            per_code[code].add(task.split('/')[1])

    for code in per_code:
        print(code)
        print(json.dumps(sorted(per_code[code]), indent=2))
Esempio n. 2
0
        for wf in sorted(wf_for[site][camp]):
            report +="\t %s \n"%wf
    #print report

# Critical alert listing the ACDC workflows that cannot run.
if not_runable_acdc:
    sendLog(
        'GQ',
        'These %s ACDC cannot run \n%s' % (
            len(not_runable_acdc),
            '\n'.join(sorted(not_runable_acdc)),
        ),
        level='critical',
    )


# Anything present both in the snapshot read back from EOS and in the current
# ``stuck_all_done`` set is reported as not having progressed.
old_stuck_all_done = set(
    json.loads(eosRead('%s/stuck_all_done.json' % base_eos_dir)))
really_stuck_all_done = old_stuck_all_done & stuck_all_done
if really_stuck_all_done:
    sendLog(
        'GQ',
        'These %d workflows have not toggled further to completed while all WQE are done\n%s' % (
            len(really_stuck_all_done),
            '\n'.join(sorted(really_stuck_all_done)),
        ),
        level='critical',
    )
# Overwrite the snapshot with the current set.
eosFile('%s/stuck_all_done.json' % base_eos_dir, 'w').write(
    json.dumps(sorted(stuck_all_done), indent=2)).close()

if failed_workflow:
    sendLog(
        'GQ',
        'These workflows have failed wqe and will stay stuck:\n%s' % (
            '\n'.join(failed_workflow)))

# No alert is sent for stalled agents: the sendLog call is commented out, so
# this loop currently has no effect.
if agents_down:
    for agent, tasks in agents_down.iteritems():
        if tasks:
            #sendLog('GQ','These tasks look stalled in agent %s \n%s'%( agent, '\n'.join(sorted(tasks))),level='critical')
            pass


# Append the list of unprocessable blocks to the report.
unproc = (
    "\n\nUnprocessable blocks : i.e no overlap of the site whitelist and the location\n\n"
    + '\n'.join(sorted(unprocessable)))
report += unproc
Esempio n. 3
0
    def run(self):
        site = self.site
        print "checking on site", site
        si = self.SI
        UC = self.UC
        RDI = self.RDI
        options = self.options
        locks = self.locks
        waiting = self.waiting
        stuck = self.stuck
        missing = self.missing
        remainings = {}

        ds = si.getRemainingDatasets(si.CE_to_SE(site))
        #print len(ds)
        taken_size = 0.
        sum_waiting = 0.
        sum_stuck = 0.
        sum_missing = 0.
        sum_unlocked = 0.
        n_ds = options.ndatasets
        i_ds = 0
        ds_threads = []
        for i_ds, (size, dataset) in enumerate(ds):
            if n_ds and i_ds >= n_ds: break
            remainings[dataset] = {"size": size, "reasons": []}
            #print "-"*10
            if not dataset in locks:
                #print dataset,"is not locked"
                sum_unlocked += size
                remainings[dataset]["reasons"].append('unlock')
            else:
                remainings[dataset]["reasons"].append('lock')
            if dataset in waiting:
                #print dataset,"is waiting for custodial"
                sum_waiting += size
                remainings[dataset]["reasons"].append('tape')

            if dataset in stuck:
                sum_stuck += size
                remainings[dataset]["reasons"].append('stuck-tape')
            if dataset in missing:
                sum_missing += size
                remainings[dataset]["reasons"].append('missing-tape')

            ds_threads.append(DatasetCheckBuster(dataset=dataset, url=url))

        run_threads = ThreadHandler(threads=ds_threads,
                                    label='%s Dataset Threads' % site,
                                    n_threads=10,
                                    start_wait=0,
                                    timeout=None,
                                    verbose=True)
        ## start and sync
        run_threads.run()
        #run_threads.start()
        #while run_threads.is_alive():
        #    time.sleep(10)

        for t in run_threads.threads:
            remainings[t.dataset]["reasons"].extend(t.reasons)
            remainings[t.dataset]["reasons"].sort()
            print t.dataset, remainings[t.dataset]["reasons"]

        #print "\t",sum_waiting,"[GB] could be freed by custodial"
        print "\t", sum_unlocked, "[GB] is not locked by unified"

        print "updating database with remaining datasets"
        RDI.set(site, remainings)
        try:
            eosFile('%s/remaining_%s.json' % (monitor_dir, site),
                    'w').write(json.dumps(remainings, indent=2)).close()
        except:
            pass

        ld = remainings.items()
        ld.sort(key=lambda i: i[1]['size'], reverse=True)
        table = "<html>Updated %s GMT, <a href=remaining_%s.json>json data</a><br>" % (
            time.asctime(time.gmtime()), site)

        accumulate = defaultdict(lambda: defaultdict(float))
        for item in remainings:
            tier = item.split('/')[-1]

            for reason in remainings[item]['reasons']:
                accumulate[reason][tier] += remainings[item]['size']
        table += "<table border=1></thead><tr><th>Reason</th><th>size [TB]</th></thead>"
        for reason in accumulate:
            s = 0
            table += "<tr><td>%s</td><td><ul>" % reason
            subitems = accumulate[reason].items()
            subitems.sort(key=lambda i: i[1], reverse=True)

            for tier, ss in subitems:
                table += "<li> %s : %10.3f</li>" % (tier, ss / 1024.)
                s += ss / 1024.
            table += "</ul>total : %.3f</td>" % s

        table += "</table>\n"
        table += "<table border=1></thead><tr><th>Dataset</th><th>Size [GB]</th><th>Label</th></tr></thead>\n"
        only_unlock = set()
        for item in ld:
            ds_name = item[0]
            reasons = item[1]['reasons']
            sub_url = '<a href="https://cmsweb.cern.ch/das/request?input=%s">%s</a>' % (
                ds_name, ds_name)
            if 'unlock' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?block=%s%%23*&node=%s">block</a>' % (
                    ds_name, site)
            if 'unlock' in reasons or 'input' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?inputdataset=%s&mask=RequestName&mask=RequestStatus">input</a>' % (
                    ds_name)
            if 'unlock' in reasons or 'output' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s&mask=RequestName&mask=RequestStatus">output</a>' % (
                    ds_name)
            if 'pilup' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?mc_pileup=%s&mask=RequestName&mask=RequestStatus">secondary</a>' % (
                    ds_name)
            table += "<tr><td>%s</td><td>%d</td><td><ul>%s</ul></td></tr>\n" % (
                sub_url, item[1]['size'], "<li>".join([""] + reasons))
            if reasons == ['unlock']:
                only_unlock.add(item[0])
        table += "</table></html>"
        eosFile('%s/remaining_%s.html' % (monitor_dir, site),
                'w').write(table).close()

        print "checking on unlock only datasets"
        to_ddm = UC.get('tiers_to_DDM')
        #look_at = list(only_unlock)
        look_at = list(only_unlock)[:20]
        #look_at = list([ds for ds in only_unlock if not ds.endswith('NANOAODSIM')])
        for item in look_at:
            tier = item.split('/')[-1]
            ds_status = getDatasetStatus(item)
            print item, ds_status
            if ds_status == 'PRODUCTION':
                print item, "is found", ds_status, "and unklocked on", site
                if options.invalidate_anything_left_production_once_unlocked:
                    print "Setting status to invalid for", item
                    setDatasetStatus(item, 'INVALID')
            if tier in to_ddm:
                print item, "looks like analysis and still dataops on", site
                if options.change_dataops_subs_to_anaops_once_unlocked:
                    print "Sending", item, "to anaops"
                    allCompleteToAnaOps(url, item)
Esempio n. 4
0
    sendLog('GQ',
            'These %s ACDC cannot run \n%s' %
            (len(not_runable_acdc), '\n'.join(sorted(not_runable_acdc))),
            level='critical')

# Workflows present both in the snapshot saved on EOS and in the current
# ``stuck_all_done`` set are reported as not having progressed.
old_stuck_all_done = set(json.loads(eosRead('%s/stuck_all_done.json' % base_eos_dir)))
really_stuck_all_done = old_stuck_all_done & stuck_all_done
if really_stuck_all_done:
    sendLog('GQ',
            'These %d workflows have not toggled further to completed while all WQE are done\n%s' % (
                len(really_stuck_all_done), '\n'.join(sorted(really_stuck_all_done))),
            level='critical')
# Replace the snapshot with the current set.
eosFile('%s/stuck_all_done.json' % base_eos_dir, 'w').write(
    json.dumps(sorted(stuck_all_done), indent=2)).close()

if failed_workflow:
    sendLog('GQ',
            'These workflows have failed wqe and will stay stuck:\n%s' % ('\n'.join(failed_workflow)))

# No alert is sent for stalled agents: the sendLog call is commented out, so
# this loop currently has no effect.
if agents_down:
    for agent, tasks in agents_down.iteritems():
        if tasks:
            #sendLog('GQ','These tasks look stalled in agent %s \n%s'%( agent, '\n'.join(sorted(tasks))),level='critical')
            pass

# Text block listing the unprocessable blocks.
unproc = ("\n\nUnprocessable blocks : i.e no overlap of the site whitelist and the location\n\n" +
          '\n'.join(sorted(unprocessable)))
Esempio n. 5
0
            print "\nrelocking",dataset
            newly_locking.add(dataset) 
           
        time_point("Checked all")
    except Exception as e:
        print "Error in checking unlockability. relocking",dataset
        print str(e)
        newly_locking.add(dataset)


## just for a couple of rounds
waiting_for_custodial={}
stuck_custodial={}
lagging_custodial={}
missing_approval_custodial={}
eosFile('%s/waiting_custodial.json'%monitor_dir,'w').write( json.dumps( waiting_for_custodial , indent=2) ).close()
eosFile('%s/stuck_custodial.json'%monitor_pub_dir,'w').write( json.dumps( stuck_custodial , indent=2) ).close()
eosFile('%s/lagging_custodial.json'%monitor_dir,'w').write( json.dumps( lagging_custodial , indent=2) ).close()
eosFile('%s/missing_approval_custodial.json'%monitor_dir,'w').write( json.dumps( missing_approval_custodial , indent=2) ).close()

## then for all that would have been invalidated from the past, check whether you can unlock the wf based on output
for wfo in session.query(Workflow).filter(Workflow.status=='forget').all():
    wfi = workflowInfo(url, wfo.name)
    if all([o not in newly_locking for o in wfi.request['OutputDatasets']]) and not 'unlock' in wfo.status:
        wfo.status +='-unlock'
        print "then setting",wfo.name,"to",wfo.status
    session.commit()

time_point("verified those in forget")

for item in also_locking_from_reqmgr: 
Esempio n. 6
0
def completor(url, specific):
    mlock = moduleLock(silent=True)
    if mlock(): return 


    use_mcm = True
    up = componentInfo(soft=['mcm','wtc','jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    safe_mode = False

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira',False) else None

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]
        

    all_stuck = set()
    ## take into account what stagor was saying
    for itry in range(5):
        try:
            all_stuck.update( json.loads( eosRead('%s/stuck_transfers.json'%monitor_pub_dir)))
            break
        except:
            time.sleep(2)
        
    for itry in range(5):
         try:
             ## take into account the block that needed to be repositioned recently
             all_stuck.update( [b.split('#')[0] for b in json.loads( eosRead('%s/missing_blocks.json'%monitor_dir)) ] )
             break
         except:
             time.sleep(2)

    ## take into account all stuck block and dataset from transfer team
    all_stuck.update( getAllStuckDataset()) 


    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {} 
    timeout = {}
    campaign_injection_delay = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']
        if 'injection-delay' in CI.campaigns[c]:
            campaign_injection_delay[c] = CI.campaigns[c]['injection-delay']
        if 'overdoing-complete' in CI.campaigns[c]:
            overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print "can truncate complete on"
    print json.dumps( truncate_fractions ,indent=2)
    print "can overide on"
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time above which to fire a JIRA
    jira_priority_and_delays = { 110000 : 21,
                                 90000 : 28,
                            #     80000 : 60,
                            #0 : 90
                             }

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        campaigns = wfi.getCampaigns()

        #if not any([c in good_fractions.keys() for c in campaigns]): skip=True
        #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True

        for user,spec in overrides.items():
            if not spec: continue
            spec = filter(None, spec)
            if not wfi.request['RequestStatus'] in ['force-complete', 'completed']:
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue



        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask() ## can use that one, and follow mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        #allowed_delay_per_out = {}
        for task,outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask( task )
            for out in outs:
                good_fraction_per_out[out] = good_fractions.get(task_campaign,1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(task_campaign,default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(task_campaign,1000.)
                #allowed_delay_per_out[out] = timeout.get(task_campaign, 14)

        #print "force at", json.dumps( good_fraction_per_out, indent=2)
        #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2)

        now = time.mktime(time.gmtime()) / (60*60*24.)

        priority_log = filter(lambda change: change['Priority'] == priority,wfi.request.get('PriorityTransition',[]))
        if not priority_log:
            print "\tHas no priority log"
            priority_delay = 0
        else:
            then = max([change['UpdateTime'] for change in priority_log]) / (60.*60.*24.)
            priority_delay = now - then ## in days
            print "priority was set to",priority,priority_delay,"[days] ago"

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            delay = 0
        else:
            then = max([change['UpdateTime'] for change in running_log]) / (60.*60.*24.)
            delay = now - then ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog('completor',"Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"%( cpuh, delay, priority, priority_delay))
        if priority_delay!=0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url, original.request['OriginalRequestName'])
        injected_log = filter(lambda change : change["Status"] in ["assignment-approved"],original.request['RequestTransition'])
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / (60.*60.*24.)
            injection_delay = now - injected_on
        

        delay_for_priority_increase = injection_delay
        #delay_for_priority_increase = delay

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority
        
        pop_a_jira = False
        ping_on_jira = 7 *(24*60*60) # 7 days
        for jp,jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd: pop_a_jira = True

        if pop_a_jira and JC:
            j,reopened,just_created = JC.create_or_last( prepid = wfi.request['PrepID'],
                                                    priority = wfi.request['RequestPriority'],
                                                    label = 'Late',
                                                    reopen = True)
            last_time = JC.last_time( j )
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d"%( delay, priority)
                JC.comment(j.key, j_comment)
            

        if delay_for_priority_increase!=None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority:
            quantized = 5000 ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority']+ int((delay_priority_increase * (delay_for_priority_increase - injection_delay_threshold) / 7) / quantized) * quantized
            tail_cutting_priority += 101 ## to signal it is from this mechanism
            tail_cutting_priority = min(400000, tail_cutting_priority) ## never go above 400k priority
            tail_cutting_priority = max(tail_cutting_priority, priority) ## never go below the current value
            
            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, increasing to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','bumping priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority)
                    max_priority-=1
                else:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, would like to increase to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','would like to bump priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    print "Could be changing the priority to higher value, but too many already were done"

        _,prim,_,_ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = max([timeout.get(c,14) for c in campaigns])
            
        monitor_delay = min(monitor_delay, allowed_delay)

        ### just skip if too early, just for the sake of not computing the completion fraction just now.
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay: 
            print "not enough time has passed yet"
            continue

        long_lasting[wfo.name] = { "delay" : delay,
                                   "injection_delay" : injection_delay }

        percent_completions = wfi.getCompletionFraction(caller='completor')
        
        if not percent_completions:
            sendLog('completor','%s has no output at all'% wfo.name, level='critical')
            continue

        is_over_allowed_delay = (all([percent_completions[out] >= good_fraction_per_out.get(out,1000.) for out in percent_completions]) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and (all([percent_completions[out] >= truncate_fraction_per_out.get(out,1000.) for out in percent_completions])) and delay >= allowed_delay)
        is_over_completion = (all([percent_completions[out] >= good_fraction_nodelay_per_out.get(out,1000.) for out in percent_completions]))

        if is_over_completion:
            wfi.sendLog('completor', "all is over completed %s\n %s"%( json.dumps( good_fraction_nodelay_per_out, indent=2 ),
                                                                       json.dumps( percent_completions, indent=2 )
                                                                       ))
        elif is_over_allowed_delay:
            wfi.sendLog('completor', "all is above %s \n%s"%( json.dumps(good_fraction_per_out, indent=2 ), 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        elif is_over_truncation_delay:
            wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( json.dumps(truncate_fraction_per_out, indent=2 ),
                                                                                                      json.dumps( percent_completions, indent=2 ) ) )

        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"%(json.dumps( percent_completions, indent=2), 
                                                                                                 json.dumps(good_fraction_per_out, indent=2),
                                                                                                 json.dumps( truncate_fraction_per_out, indent=2),
                                                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        #for output in  percent_completions:
        #    completions[output]['injected'] = then
            

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        ## find ACDCs that might be running
        if max_force>0:
            print "going for force-complete of",wfo.name
            if not safe_mode:
                forceComplete(url, wfi )
                set_force_complete.add( wfo.name )
                wfi.sendLog('completor','going for force completing')
                wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
                max_force -=1
            else:
                sendEmail('completor', 'The workflow %s is ready for force complete, but completor is in safe mode'%wfo.name)
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))
    eosFile('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )).close()

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )


    print text
Esempio n. 7
0
def completor(url, specific):
    mlock = moduleLock(silent=True)
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    safe_mode = False

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira', False) else None

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all())
    wfs.extend(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())

    ## just take it in random order so that not always the same is seen
    random.shuffle(wfs)

    max_per_round = UC.get('max_per_round').get('completor', None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    all_stuck = set()
    ## take into account what stagor was saying
    for itry in range(5):
        try:
            all_stuck.update(
                json.loads(eosRead('%s/stuck_transfers.json' %
                                   monitor_pub_dir)))
            break
        except:
            time.sleep(2)

    for itry in range(5):
        try:
            ## take into account the block that needed to be repositioned recently
            all_stuck.update([
                b.split('#')[0] for b in json.loads(
                    eosRead('%s/missing_blocks.json' % monitor_dir))
            ])
            break
        except:
            time.sleep(2)

    ## take into account all stuck block and dataset from transfer team
    all_stuck.update(getAllStuckDataset())

    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {}
    timeout = {}
    campaign_injection_delay = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']
        if 'injection-delay' in CI.campaigns[c]:
            campaign_injection_delay[c] = CI.campaigns[c]['injection-delay']
        if 'overdoing-complete' in CI.campaigns[c]:
            overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    print "can truncate complete on"
    print json.dumps(truncate_fractions, indent=2)
    print "can overide on"
    print json.dumps(overrides, indent=2)
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time above which to fire a JIRA
    jira_priority_and_delays = {
        110000: 21,
        90000: 28,
        #     80000 : 60,
        #0 : 90
    }

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at", wfo.name

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip = False
        campaigns = wfi.getCampaigns()

        #if not any([c in good_fractions.keys() for c in campaigns]): skip=True
        #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True

        for user, spec in overrides.items():
            if not spec: continue
            spec = filter(None, spec)
            if not wfi.request['RequestStatus'] in [
                    'force-complete', 'completed'
            ]:
                if any(s in wfo.name
                       for s in spec) or (wfo.name in spec) or any(
                           pid in spec for pid in pids) or any(s in pids
                                                               for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url, wfi)
                    skip = True
                    wfi.notifyRequestor(
                        "The workflow %s was force completed by request of %s"
                        % (wfo.name, user),
                        do_batch=False)
                    wfi.sendLog(
                        'completor',
                        '%s is asking for %s to be force complete' %
                        (user, wfo.name))
                    break

        if wfo.status.startswith('assistance'): skip = True

        if skip:
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in [
                'acquired', 'running-open', 'running-closed'
        ]:
            continue

        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask(
        )  ## can use that one, and follow mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        #allowed_delay_per_out = {}
        for task, outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask(task)
            for out in outs:
                good_fraction_per_out[out] = good_fractions.get(
                    task_campaign, 1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(
                    task_campaign, default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(
                    task_campaign, 1000.)
                #allowed_delay_per_out[out] = timeout.get(task_campaign, 14)

        #print "force at", json.dumps( good_fraction_per_out, indent=2)
        #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2)

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.)

        priority_log = filter(lambda change: change['Priority'] == priority,
                              wfi.request.get('PriorityTransition', []))
        if not priority_log:
            print "\tHas no priority log"
            priority_delay = 0
        else:
            then = max([change['UpdateTime']
                        for change in priority_log]) / (60. * 60. * 24.)
            priority_delay = now - then  ## in days
            print "priority was set to", priority, priority_delay, "[days] ago"

        running_log = filter(
            lambda change: change["Status"
                                  ] in ["running-open", "running-closed"],
            wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            delay = 0
        else:
            then = max([change['UpdateTime']
                        for change in running_log]) / (60. * 60. * 24.)
            delay = now - then  ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog(
            'completor',
            "Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"
            % (cpuh, delay, priority, priority_delay))
        if priority_delay != 0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url,
                                    original.request['OriginalRequestName'])
        injected_log = filter(
            lambda change: change["Status"] in ["assignment-approved"],
            original.request['RequestTransition'])
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / (60. * 60. * 24.)
            injection_delay = now - injected_on

        delay_for_priority_increase = injection_delay
        #delay_for_priority_increase = delay

        (w, d) = divmod(delay, 7)
        print "\t" * int(
            w) + "Running since", delay, "[days] priority=", priority

        pop_a_jira = False
        ping_on_jira = 7 * (24 * 60 * 60)  # 7 days
        for jp, jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd: pop_a_jira = True

        if pop_a_jira and JC:
            j, reopened, just_created = JC.create_or_last(
                prepid=wfi.request['PrepID'],
                priority=wfi.request['RequestPriority'],
                label='Late',
                reopen=True)
            last_time = JC.last_time(j)
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d" % (
                    delay, priority)
                JC.comment(j.key, j_comment)

        if delay_for_priority_increase != None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority:
            quantized = 5000  ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority'] + int(
                (delay_priority_increase *
                 (delay_for_priority_increase - injection_delay_threshold) / 7)
                / quantized) * quantized
            tail_cutting_priority += 101  ## to signal it is from this mechanism
            tail_cutting_priority = min(
                400000, tail_cutting_priority)  ## never go above 400k priority
            tail_cutting_priority = max(
                tail_cutting_priority,
                priority)  ## never go below the current value

            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, increasing to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'bumping priority to %d for being injected since %s' %
                        (tail_cutting_priority, delay_for_priority_increase))

                    reqMgrClient.changePriorityWorkflow(
                        url, wfo.name, tail_cutting_priority)
                    max_priority -= 1
                else:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, would like to increase to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'would like to bump priority to %d for being injected since %s'
                        % (tail_cutting_priority, delay_for_priority_increase))

                    print "Could be changing the priority to higher value, but too many already were done"

        _, prim, _, _ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck:
            wfi.sendLog('completor', '%s is stuck' % ','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = max([timeout.get(c, 14) for c in campaigns])

        monitor_delay = min(monitor_delay, allowed_delay)

        ### just skip if too early, just for the sake of not computing the completion fraction just now.
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay:
            print "not enough time has passed yet"
            continue

        long_lasting[wfo.name] = {
            "delay": delay,
            "injection_delay": injection_delay
        }

        percent_completions = wfi.getCompletionFraction(caller='completor')

        if not percent_completions:
            sendLog('completor',
                    '%s has no output at all' % wfo.name,
                    level='critical')
            continue

        is_over_allowed_delay = (all([
            percent_completions[out] >= good_fraction_per_out.get(out, 1000.)
            for out in percent_completions
        ]) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and (all([
            percent_completions[out] >= truncate_fraction_per_out.get(
                out, 1000.) for out in percent_completions
        ])) and delay >= allowed_delay)
        is_over_completion = (all([
            percent_completions[out] >= good_fraction_nodelay_per_out.get(
                out, 1000.) for out in percent_completions
        ]))

        if is_over_completion:
            wfi.sendLog(
                'completor', "all is over completed %s\n %s" %
                (json.dumps(good_fraction_nodelay_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_allowed_delay:
            wfi.sendLog(
                'completor', "all is above %s \n%s" %
                (json.dumps(good_fraction_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_truncation_delay:
            wfi.sendLog(
                'completor',
                "all is above %s truncation level, and the input is stuck\n%s"
                % (json.dumps(truncate_fraction_per_out, indent=2),
                   json.dumps(percent_completions, indent=2)))

        else:
            long_lasting[wfo.name].update({
                'completion':
                sum(percent_completions.values()) / len(percent_completions),
                'completions':
                percent_completions
            })

            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog(
                'completor',
                "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"
                % (json.dumps(percent_completions, indent=2),
                   json.dumps(good_fraction_per_out, indent=2),
                   json.dumps(truncate_fraction_per_out, indent=2),
                   json.dumps(long_lasting[wfo.name]['agents'], indent=2)))
            continue

        #for output in  percent_completions:
        #    completions[output]['injected'] = then

        ran_at = wfi.request['SiteWhitelist']

        wfi.sendLog('completor', "Required %s, time spend %s" % (cpuh, delay))

        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        ## find ACDCs that might be running
        if max_force > 0:
            print "going for force-complete of", wfo.name
            if not safe_mode:
                forceComplete(url, wfi)
                set_force_complete.add(wfo.name)
                wfi.sendLog('completor', 'going for force completing')
                wfi.notifyRequestor(
                    "The workflow %s was force completed for running too long"
                    % wfo.name)
                max_force -= 1
            else:
                sendEmail(
                    'completor',
                    'The workflow %s is ready for force complete, but completor is in safe mode'
                    % wfo.name)
        else:
            wfi.sendLog(
                'completor',
                "too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog(
            'completor', 'The followings were set force-complete \n%s' %
            ('\n'.join(set_force_complete)))

    #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text = "These have been running for long"

    #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))
    eosFile('%s/longlasting.json' % monitor_dir,
            'w').write(json.dumps(long_lasting, indent=2)).close()

    for wf, info in sorted(long_lasting.items(),
                           key=lambda tp: tp[1]['delay'],
                           reverse=True):
        delay = info['delay']
        text += "\n %s : %s days" % (wf, delay)
        if 'completion' in info:
            text += " %d%%" % (info['completion'] * 100)

    print text
Esempio n. 8
0
def parse_one(url, wfn, options=None):
    def time_point(label="", sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "[showError] Time check (%s) point at : %s" % (label, nows)
        print "[showError] Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "[showError] Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "[showError] Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())

    task_error_site_count = {}
    one_explanation = defaultdict(set)
    per_task_explanation = defaultdict(set)

    if wfn in [
            'vlimant_task_EXO-RunIISummer15wmLHEGS-04800__v1_T_170906_141738_1357'
    ]:
        return task_error_site_count, one_explanation

    time_point("Starting with %s" % wfn)
    threads = []

    SI = global_SI()
    UC = unifiedConfiguration()
    wfi = workflowInfo(url, wfn)
    time_point("wfi", sub_lap=True)
    where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo()
    time_point("acdcinfo", sub_lap=True)
    all_blocks, needed_blocks_loc, files_in_blocks, files_and_loc_notin_dbs = wfi.getRecoveryBlocks(
    )
    time_point("inputs", sub_lap=True)

    ancestor = workflowInfo(url, wfn)
    lhe, prim, _, sec = ancestor.getIO()
    high_order_acdc = 0
    while ancestor.request['RequestType'] == 'Resubmission':
        ancestor = workflowInfo(url, ancestor.request['OriginalRequestName'])
        lhe, prim, _, sec = ancestor.getIO()
        high_order_acdc += 1

    no_input = (not lhe) and len(prim) == 0 and len(sec) == 0

    cache = options.cache
    print "cache timeout", cache

    err = wfi.getWMErrors(cache=cache)
    time_point("wmerrors", sub_lap=True)
    stat = wfi.getWMStats(cache=cache)
    time_point("wmstats", sub_lap=True)
    #adcd = wfi.getRecoveryDoc()

    total_by_code_dash = defaultdict(int)
    total_by_site_dash = defaultdict(int)
    r_dashb = defaultdict(lambda: defaultdict(int))
    dash_board_h = 1
    if False:
        ## NB get the since from when the wf has started, not a fixed value
        ## no dashboard until we get a better api
        #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache)
        dashb = {}
        #print json.dumps( dashb , indent=2)
        for site, sinfo in dashb.items():
            for s_code, counts in sinfo.items():
                d_statuses = ['submitted', 'pending', 'app-unknown', 'done']
                total_by_code_dash[str(s_code)] += counts.get('submitted', 0)
                total_by_site_dash[site] += counts.get('submitted', 0)
                r_dashb[str(s_code)][site] += counts.get('submitted', 0)

        print json.dumps(total_by_code_dash, indent=2)
        print json.dumps(total_by_site_dash, indent=2)

    time_point("Got most input")

    status_per_task = defaultdict(lambda: defaultdict(int))

    if not 'AgentJobInfo' in stat:
        stat['AgentJobInfo'] = {}
        print "no information in AgentJobInfo, they agents must have been retired. I cannot go on without creating a partial report"
        return task_error_site_count, one_explanation
        #print "bad countent ?"
        #print json.dumps(  stat,  indent=2)

    for agent in stat['AgentJobInfo']:
        for task in stat['AgentJobInfo'][agent]['tasks']:
            if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]:
                continue
            for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']:
                info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][
                    status]
                #print status,stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]
                if type(info) == dict:
                    status_per_task[task][status] += sum(
                        stat['AgentJobInfo'][agent]['tasks'][task]['status']
                        [status].values())
                else:
                    status_per_task[task][status] += stat['AgentJobInfo'][
                        agent]['tasks'][task]['status'][status]

    #print json.dumps( status_per_task, indent=2)
    db_total_per_site = defaultdict(int)
    db_total_per_code = defaultdict(int)
    ## cannot do that since there is no task count in dashboard and we have to take away the submitted
    #for site in dashb:
    #    for error in dashb[site]:
    #        db_total_per_site[site] += dashb[site][error]
    #        db_total_per_code[code] += dashb[site][error]

    print "ACDC Information"
    print "\t where to re-run"
    print json.dumps(where_to_run, indent=2)
    print "\t Missing events"
    print json.dumps(missing_to_run, indent=2)
    print "\t Missing events per site"
    print json.dumps(missing_to_run_at, indent=2)

    if not where_to_run and not missing_to_run and not missing_to_run_at:
        print "showError is unable to run"
        #return task_error_site_count, one_explanation
        pass

    do_JL = not options.no_JL
    do_CL = not options.no_CL
    do_all_error_code = options.all_errors
    if high_order_acdc >= 1:
        print high_order_acdc, "order request, pulling down all logs"
        do_all_error_code = True
    if wfi.isRelval():
        print "getting all codes for relval"
        do_all_error_code = True

    tasks = sorted(set(err.keys() + missing_to_run.keys()))

    if not tasks:
        print "no task to look at"
        #return task_error_site_count

    html = "<html> <center><h1>%s, Updated on %s (GMT)" % (
        wfn, time.asctime(time.gmtime()))

    html += '</center>'
    html += '<a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>dts</a>, ' % (
        wfn)
    html += '<a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>ac</a>, ' % (
        wfi.request['PrepID'])
    html += '<a href=https://cms-gwmsmon.cern.ch/prodview/%s>Job Progress</a>, ' % (
        wfn)
    r_type = wfi.request.get('OriginalRequestType',
                             wfi.request.get('RequestType', 'NaT'))
    if r_type in ['ReReco']:
        html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a>, ' % wfi.request[
            'PrepID']
    html += '<a href="https://its.cern.ch/jira/issues/?jql=text~%s AND project = CMSCOMPPR" target="_blank">jira</a>,' % (
        wfi.request['PrepID'])
    html += '<a href="https://vocms0113.cern.ch/seeworkflow/?workflow=%s">console</a>' % wfn
    html += '<hr>'
    html += '<a href=#IO>I/O</a>, <a href=#ERROR>Errors</a>, <a href=#BLOCK>blocks</a>, <a href=#FILE>files</a>, <a href=#CODES>Error codes</a><br>'
    html += '<hr>'

    time_point("Header writen")

    html += '<a name=IO></a>'
    if prim:
        html += 'Reads in primary<br>'
        rwl = wfi.getRunWhiteList()
        lwl = wfi.getLumiWhiteList()
        for dataset in prim:
            html += '<b>%s </b>(events/lumi ~%d)' % (
                dataset, getDatasetEventsPerLumi(dataset))
            blocks = getDatasetBlocks(dataset, runs=rwl) if rwl else None
            blocks = getDatasetBlocks(dataset, lumis=lwl) if lwl else None
            available = getDatasetBlocksFraction(url,
                                                 dataset,
                                                 only_blocks=blocks)
            html += '<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>' % available
            html += '<ul>'
            presence = getDatasetPresence(url, dataset, only_blocks=blocks)

            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul><br>'

    if sec:
        html += 'Reads in secondary<br>'
        for dataset in sec:
            presence = getDatasetPresence(url, dataset)
            html += '<b>%s</b><ul>' % dataset
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul>'

    outs = sorted(wfi.request['OutputDatasets'])
    if outs:
        html += 'Produces<br>'
        for dataset in outs:
            presence = getDatasetPresence(url, dataset)
            html += '<b>%s </b>(events/lumi ~ %d)<ul>' % (
                dataset, getDatasetEventsPerLumi(dataset))
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul>'

    time_point("Input checked")

    html += """
<hr><br>
<a name=ERROR></a>
<ul>
<li> <b><i>dashboard numbers over %d days</b></i>
<li> &uarr; %% with respect to total number of error in the code
<li> &rarr; %% with respect to total number of error at the site
</ul>
""" % (dash_board_h)

    html += '<br>'

    n_expose_base = options.expose  # if options else UC.get('n_error_exposed')
    print "getting", n_expose_base, "logs by default"
    if tasks:
        min_rank = min([task.count('/') for task in tasks])
    for task in tasks:
        n_expose = n_expose_base

        expose_archive_code = dict([(str(code), defaultdict(lambda: n_expose))
                                    for code in UC.get('expose_archive_code')])
        expose_condor_code = dict([(str(code), defaultdict(lambda: n_expose))
                                   for code in UC.get('expose_condor_code')])

        #print task
        task_rank = task.count('/')
        task_short = task.split('/')[-1]
        total_per_site = defaultdict(int)
        time_point("Starting with task %s" % task_short, sub_lap=True)

        notreported = 'NotReported'

        total_count = defaultdict(int)
        error_site_count = defaultdict(lambda: defaultdict(int))

        all_not_reported = set()
        for agent in stat['AgentJobInfo']:
            for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get(
                    'skipped', {}):
                info = stat['AgentJobInfo'][agent]['tasks'][task]['skipped'][
                    site]
                #print info
                all_not_reported.add(site)
                ce = SI.SE_to_CE(site)
                error_site_count[notreported][ce] += info.get(
                    'skippedFiles', 0)
                total_count[notreported] += info.get('skippedFiles', 0)

            for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get(
                    'sites', {}):

                info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][
                    site]
                for s in ['success', 'failure', 'cooloff', 'submitted']:
                    if not s in info: continue
                    data = info[s]
                    if type(data) == dict:
                        total_per_site[site] += sum(data.values())
                    else:
                        total_per_site[site] += data

        #is the task relevant to recover (discard log, cleanup)
        if any([v in task.lower() for v in ['logcol', 'cleanup']]): continue

        #total_count= defaultdict(int)
        #error_site_count = defaultdict( lambda : defaultdict(int))
        if not task in err:
            print task, "has not reported error"
            err[task] = {}
        #print err[task].keys()

        for exittype in err[task]:
            #print "\t",err[task][exittype].keys()
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0': continue
                #print "\t\t",err[task][exittype][errorcode_s].keys()
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site][
                        'errorCount']
                    total_count[errorcode_s] += count
                    #error_site_count[errorcode_s][site] += count
                    error_site_count[errorcode_s][ce] += count

        ## show the total
        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add(site)
                if code != '0':
                    all_codes.add(code)

        s_per_code = defaultdict(int)
        for site in all_sites:
            for code in sorted(all_codes):
                s_per_code[code] += error_site_count[code][site]

        expose_top_N = UC.get('expose_top_N')
        count_top_N = min(
            sorted(s_per_code.values(),
                   reverse=True)[:expose_top_N]) if s_per_code else -1

        for exittype in err[task]:
            #print "\t",err[task][exittype].keys()
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0': continue
                #print "\t\t",err[task][exittype][errorcode_s].keys()
                force_code = (count_top_N > 0
                              and s_per_code[errorcode_s] >= count_top_N)
                if force_code: print "will expose", errorcode_s, "anyways"
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site][
                        'errorCount']
                    ###total_count[errorcode_s] += count
                    #error_site_count[errorcode_s][site] += count
                    ###error_site_count[errorcode_s][ce] += count
                    for sample in err[task][exittype][errorcode_s][site][
                            'samples']:
                        #print sample.keys()
                        for step in sample['errors']:
                            for report in sample['errors'][step]:
                                if report['type'] == 'CMSExeption': continue
                                #if int(report['exitCode']) == int(errorcode_s):
                                one_explanation[errorcode_s].add(
                                    "%s (Exit code: %s) \n%s" %
                                    (report['type'], report['exitCode'],
                                     report['details']))
                                per_task_explanation[
                                    "%s:%s" % (task_short, errorcode_s)].add(
                                        "%s (Exit code: %s) \n%s" %
                                        (report['type'], report['exitCode'],
                                         report['details']))
                                #one_explanation[errorcode_s].add( report['details'] )
                                #else:
                                #one_explanation[
                        agent = sample['agent_name']
                        wmbs = sample['wmbsid']
                        workflow = sample['workflow']
                        if force_code:
                            if not errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s] = defaultdict(
                                    lambda: n_expose)
                            if not errorcode_s in expose_archive_code:
                                expose_archive_code[errorcode_s] = defaultdict(
                                    lambda: n_expose)

                        if do_CL and ((errorcode_s in expose_condor_code and
                                       expose_condor_code[errorcode_s][agent])
                                      ) and 'cern' in agent:
                            if errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s][agent] -= 1
                            print errorcode_s, agent, "error count", expose_condor_code.get(
                                errorcode_s, {}).get(agent, 0)

                            threads.append(
                                AgentBuster(agent=agent,
                                            workflow=workflow,
                                            wmbs=wmbs,
                                            errorcode_s=errorcode_s,
                                            base_eos_dir=base_eos_dir,
                                            monitor_eos_dir=monitor_eos_dir,
                                            task_short=task_short))

                        for out in sample['output']:
                            #print out
                            if out['type'] == 'logArchive':
                                if do_JL and (
                                    (errorcode_s in expose_archive_code and
                                     expose_archive_code[errorcode_s][agent] >
                                     0)):
                                    if errorcode_s in expose_archive_code:
                                        expose_archive_code[errorcode_s][
                                            agent] -= 1
                                    print errorcode_s, agent, "error count", expose_archive_code.get(
                                        errorcode_s, {}).get(agent, 0)

                                    threads.append(
                                        XRDBuster(
                                            out_lfn=out['lfn'],
                                            monitor_eos_dir=monitor_eos_dir,
                                            wfn=wfn,
                                            errorcode_s=errorcode_s,
                                            task_short=task_short,
                                            from_eos=(
                                                not options.not_from_eos
                                            ),  # if options else True),
                                        ))

        #print task
        #print json.dumps( total_count, indent=2)
        #print json.dumps( explanations , indent=2)
        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add(site)
                if code != '0':
                    all_codes.add(code)

        ## parse the dashboard data
        for site in total_by_site_dash:
            ## no. cannot discriminate by task in dashboard...
            #all_sites.add( site )
            pass

        ## parse the acdc data
        #notreported='NotReported'
        #all_missing_stats = set()
        #for site in missing_to_run_at[task] if task in missing_to_run_at else []:
        #    if not missing_to_run_at[task][site]: continue
        #    ce = SI.SE_to_CE( site )
        #    #all_sites.add( ce )
        #    all_missing_stats.add( ce )

        #all_missing_stats = all_missing_stats &set(SI.all_sites)
        #all_not_reported = all_missing_stats - all_sites
        #print task
        #print "site with no report",sorted(all_not_reported)
        #print sorted(all_sites)
        #print sorted(all_missing_stats)
        #all_sites = all_missing_stats | all_sites

        #all_sites = all_sites & set(SI.all_sites)

        no_error = len(all_not_reported) != 0

        if not no_error and notreported in all_codes:
            all_codes.remove(notreported)
        missing_events = missing_to_run[task] if task in missing_to_run else 0
        feff = wfi.getFilterEfficiency(task.split('/')[-1])
        html += "<a name=%s>" % task.split('/')[-1]
        html += "<b>%s</b>" % task.split('/')[-1]
        if missing_events:
            if feff != 1.:
                html += ' is missing %s events in input and <b>about %s events in output</b>' % (
                    "{:,}".format(missing_events), "{:,}".format(
                        int(missing_events * feff)))
            else:
                html += ' is missing <b>%s events in I/O</b>' % (
                    "{:,}".format(missing_events))

            html += ' <a href="https://cmsweb.cern.ch/couchdb/acdcserver/_design/ACDC/_view/byCollectionName?key=%%22%s%%22&include_docs=true&reduce=false" target=_blank>AC/DC</a>' % (
                wfn)
            if no_error:
                html += "<br><b><font color=red> and has UNreported error</font></b>"

        html += "<br><table border=1><thead><tr><th>Sites/Errors</th>"

        #for site in all_sites:
        #    html+='<th>%s</th>'%site
        for code in sorted(all_codes):
            #html+='<th><a href="#%s">%s</a>'%(code,code)
            html += '<th><a href="#%s:%s">%s</a>' % (task_short, code, code)
            if (str(code) in expose_archive_code
                    or do_all_error_code):  # and n_expose_base:
                html += ' <a href=%s/joblogs/%s/%s/%s>, JobLog</a>' % (
                    unified_url_eos, wfn, code, task_short)
            if (str(code) in expose_condor_code
                    or do_all_error_code):  # and n_expose_base:
                html += ' <a href=%s/condorlogs/%s/%s/%s>, CondorLog</a>' % (
                    unified_url_eos, wfn, code, task_short)
            html += '</th>'

        html += '<th>Total jobs</th><th>Site Ready</th>'
        html += '</tr></thead>\n'

        html += '<tr><td>Total</td>'
        for code in sorted(all_codes):
            html += '<td bgcolor=orange width=100>%d' % (s_per_code[code])
            if code in total_by_code_dash:
                html += ' (<b><i>%d</i></b>)' % total_by_code_dash[code]
            html += '</td>'

        ulist = '<ul>'
        grand = 0
        for status in sorted(status_per_task[task].keys()):
            ulist += '<li> %s %d' % (status, status_per_task[task][status])
            grand += status_per_task[task][status]
        ulist += '<li><b> Total %d </b>' % grand
        ulist += '</ul>'
        #html += '<td bgcolor=orange> %.2f%% </td>'% (100.*(float(sum(s_per_code.values()))/sum(total_per_site.values())) if sum(total_per_site.values()) else 0.)
        html += '<td bgcolor=orange> &rarr; %.2f%% &larr; </td>' % (
            100. * (float(sum(s_per_code.values())) / grand) if grand else 0.)
        html += '<td bgcolor=orange> %s </td>' % ulist

        html += '</tr>'

        def palette(frac):
            """Map a fraction to the HTML colour name of its bucket.

            The bucket is the highest lower bound that is <= frac.  When no
            bound qualifies (frac below 0.0), the colour of the highest
            bound is returned.
            """
            # (lower bound, colour), highest bound first
            buckets = (
                (0.9, 'red'),
                (0.8, 'salmon'),
                (0.7, 'orange'),
                (0.6, 'darkgreen'),
                (0.5, 'green'),
                (0.0, 'green'),
            )
            for lower_bound, colour in buckets:
                if lower_bound <= frac:
                    return colour
            # nothing matched: fall back to the top bucket
            return buckets[0][1]

        for site in sorted(all_sites):
            site_in = 'Yes'
            color = 'bgcolor=lightblue'
            if not site in SI.sites_ready:
                color = 'bgcolor=indianred'
                site_in = '<b>No</b>'
                if task in missing_to_run_at and missing_to_run_at[task][
                        SI.CE_to_SE(site)] == 0 or min_rank == task_rank:
                    color = 'bgcolor=aquamarine'
                    site_in = '<b>No</b> but fine'

            if not no_error:
                site_in += " (%s events)" % ("{:,}".format(
                    missing_to_run_at[task][SI.CE_to_SE(site)]) if task
                                             in missing_to_run_at else '--')
            html += '<tr><td %s>%s</td>' % (color, site)
            for code in sorted(all_codes):
                if code == notreported:
                    html += '<td %s width=200>%s events </td>' % (
                        color, "{:,}".format(
                            missing_to_run_at[task][SI.CE_to_SE(site)]))
                else:
                    if error_site_count[code][site]:
                        er_frac = float(
                            error_site_count[code][site]
                        ) / s_per_code[code] if s_per_code[code] else 0.
                        si_frac = float(
                            error_site_count[code][site]) / total_per_site[
                                site] if total_per_site[site] else 0.
                        html += '<td %s width=200>%d' % (
                            color, error_site_count[code][site])
                        if code in r_dashb and site in r_dashb[code]:
                            html += ' (<b><i>%d</i></b>)' % (
                                r_dashb[code][site])

                        html += ', <font color=%s>&uarr; %.1f%%</font>, <font color=%s>&rarr; %.1f%%</font></td>' % (
                            palette(er_frac), 100. * er_frac, palette(si_frac),
                            100. * si_frac)
                    else:
                        html += '<td %s>0</td>' % color
            html += '<td bgcolor=orange>%d</td>' % total_per_site[site]
            html += '<td %s>%s</td>' % (color, site_in)
            html += '</tr>\n'
        html += '</table><br>'
        task_error_site_count[task] = error_site_count

    ## run all retrieval
    run_threads = ThreadHandler(
        threads=threads,
        n_threads=options.log_threads,  # if options else 5,
        sleepy=10,
        timeout=UC.get('retrieve_errors_timeout'),
        verbose=True)
    run_threads.start()

    html += '<hr><br>'
    html += '<a name=BLOCK></a>'
    html += "<b>Blocks (%d/%d) needed for recovery</b><br>" % (
        len(needed_blocks_loc), len(all_blocks))
    for block in sorted(needed_blocks_loc.keys()):
        html += '%s <b>@ %s</b><br>' % (block, ','.join(
            sorted(needed_blocks_loc[block])))

    html += '<a name=FILE></a>'
    html += "<br><b>%s Files in no block</b><br>" % (len(
        files_and_loc_notin_dbs.keys()))
    rthreads = []
    check_files = [f for f in files_and_loc_notin_dbs.keys() if '/store' in f]
    random.shuffle(check_files)
    #check_files = check_files[:100]
    check_files = []
    by_f = {}
    f_locations = defaultdict(set)
    if check_files:
        import dynamoClient
        DC = dynamoClient.dynamoClient()
        dirs_by_site = defaultdict(set)
        for f in check_files:
            dir, fn = f.rsplit('/', 1)
            for s in files_and_loc_notin_dbs[f]:
                dirs_by_site[s].add(dir)
        files_by_site = DC.files_in_dir(dirs_by_site)
        #print dirs_by_site
        #print files_by_site

        for f in check_files:
            locs = [s for s in files_by_site if f in files_by_site[s]]
            if locs:
                by_f[f] = True
                f_locations[f].update(locs)
            else:
                by_f[f] = False
        """
        for f in check_files:
            rthreads.append( ReadBuster( file = f ))
        print "checking on existence of",len(rthreads),"files"
        run_rthreads = ThreadHandler( threads = rthreads, n_threads = 20, timeout = 10)
        run_rthreads.start()
        while run_rthreads.is_alive():
            time.sleep(10)

        for t in run_rthreads.threads:
            by_f[t.file] = t.readable
            #print "checked",t.file,t.readable
        """
    files_html = ""
    existing_html = ""
    lost_html = ""
    separate_h = False
    missing_files = defaultdict(int)
    expected_files = defaultdict(int)
    max_number_of_files = 500
    display_files = sorted(files_and_loc_notin_dbs.keys())
    display_files = display_files[:
                                  max_number_of_files] if max_number_of_files else display_files

    for f in display_files:
        readable = by_f.get(f, -1)
        if readable == -1 or not 'store' in f:
            fs = '%s' % f
            sites_strs = sorted(files_and_loc_notin_dbs[f])
        else:
            for s in files_and_loc_notin_dbs[f]:
                expected_files[s] += 1
            if readable == True:
                fs = '<font color="light green">%s</font>' % f
                #print f,"is readable"
            else:
                fs = '<font color=red>%s</font>' % f
                #print f,"is not readable"
                for s in files_and_loc_notin_dbs[f]:
                    missing_files[s] += 1

            sites_strs = [
                '<font color="%s">%s</font>' %
                ('light green' if s in f_locations[f] else 'red', s)
                for s in sorted(files_and_loc_notin_dbs[f])
            ]
            #seen_at = sorted(f_locations[f])

        html_line = '%s <b>@</b> %s<br>' % (
            fs,
            ','.join(sites_strs),
            #','.join(seen_at)
        )
        if not separate_h:
            files_html += html_line
        if readable == False:
            lost_html += html_line
        else:
            existing_html += html_line
    html += "<br><table border=1><thead><tr><td>Site</td><td>Expected files</td><td>Missing files</td></tr></thead>"
    for s in sorted(expected_files.keys()):
        if missing_files[s] or True:
            html += "<tr bgcolor=%s><td>%s</td><td>%d</td><td>%d</td></tr>" % (
                "red" if missing_files[s] else "", s, expected_files[s],
                missing_files[s])
    html += "</table><br>"

    if separate_h:
        html += existing_html
        html += lost_html
    else:
        html += files_html

    html += '<hr><br>'
    html += '<a name=CODES></a>'
    html += '<table border=1>'
    for code in per_task_explanation:
        html += '<tr><td><a name="%s">%s</a><br><a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/JobExitCodes>code twiki</a></td><td>%s</td></tr>' % (
            code, code, '<br><br>'.join(per_task_explanation[code]).replace(
                '\n', '<br>'))
    #for code in one_explanation:
    #    html +='<tr><td><a name="%s">%s</a></td><td>%s</td></tr>'% ( code, code, '<br><br>'.join(one_explanation[code]).replace('\n','<br>' ))

    html += '</table>'
    html += ('<br>' * 30)
    html += '</html>'
    time_point("Report finished")
    wfi.sendLog('error', html, show=False)
    fn = '%s' % wfn

    time_point("error send to ES")
    #open('%s/report/%s'%(monitor_dir,fn),'w').write( html )
    #open('%s/report/%s'%(monitor_eos_dir,fn),'w').write( html )
    #eosFile('%s/report/%s'%(monitor_dir,fn),'w').write( html ).close()
    eosFile('%s/report/%s' % (monitor_eos_dir, fn), 'w').write(html).close()

    time_point("Finished with showError")

    ## then wait for the retrivals to complete
    ping = 0
    while run_threads.is_alive():
        ping += 1
        if ping % 100:
            time_point("waiting for sub-threads to finish")
        time.sleep(6)

    time_point("Finished with retrieval threads")

    return task_error_site_count, one_explanation
Esempio n. 9
0
def stuckor(url=reqmgr_url):
    mlock = moduleLock()
    if mlock(): return
    TD = transferDataset()
    datasets_by_phid = TD.content()
    really_stuck_dataset = set(
        json.loads(eosRead('%s/really_stuck_dataset.json' % base_eos_dir)))

    UC = unifiedConfiguration()

    print "make a report of stuck transfers"
    bad_destinations = defaultdict(set)
    bad_sources = defaultdict(set)
    report = ""
    transfer_timeout = UC.get("transfer_timeout")
    transfer_lowrate = UC.get("transfer_lowrate")
    for phid, datasets in datasets_by_phid.items():
        issues = checkTransferLag(url, phid, datasets=list(datasets))
        for dataset in issues:
            for block in issues[dataset]:
                for destination in issues[dataset][block]:
                    (block_size, destination_size, delay, rate,
                     dones) = issues[dataset][block][destination]
                    ## count x_Buffer and x_MSS as one source
                    redones = []
                    for d in dones:
                        if d.endswith('Buffer') or d.endswith('Export'):
                            if d.replace('Buffer',
                                         'MSS').replace('Export',
                                                        'MSS') in dones:
                                continue
                            else:
                                redones.append(d)
                        else:
                            redones.append(d)
                    dones = list(set(redones))
                    #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones)
                    if delay > transfer_timeout and rate < transfer_lowrate:
                        if len(dones) > 1:
                            ## its the destination that sucks
                            bad_destinations[destination].add(block)
                        else:
                            dum = [bad_sources[d].add(block) for d in dones]
                        really_stuck_dataset.add(dataset)
                        print "add", dataset, "to really stuck"
                        report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n" % (
                            block, destination, ", ".join(dones), rate, delay)
    print "\n" * 2

    ## create tickets right away ?
    report += "\nbad sources " + ",".join(bad_sources.keys()) + "\n"
    for site, blocks in bad_sources.items():
        report += "\n\n%s:" % site + "\n\t".join([''] + list(blocks))
    report += "\nbad destinations " + ",".join(bad_destinations.keys()) + "\n"
    for site, blocks in bad_destinations.items():
        report += "\n\n%s:" % site + "\n\t".join([''] + list(blocks))

    print '\n' * 2, "Datasets really stuck"
    print '\n'.join(really_stuck_dataset)

    print '\n' * 2, "report written at %s/logs/incomplete_transfers.log" % unified_url
    print report

    missing_in_action = json.loads(
        eosRead('%s/incomplete_transfers.json' % monitor_dir))
    stuck_transfers = dict([(k, v) for (k, v) in missing_in_action.items()
                            if k in really_stuck_dataset])
    print '\n' * 2, 'Stuck dataset transfers'
    print json.dumps(stuck_transfers, indent=2)
    eosFile('%s/stuck_transfers.json' % monitor_pub_dir,
            'w').write(json.dumps(stuck_transfers, indent=2)).close()
    eosFile('%s/logs/incomplete_transfers.log' % monitor_dir,
            'w').write(report).close()
Esempio n. 10
0
            actors = getWorkflowByOutput( url, dataset , details=True)
            #actors = [wfc['doc'] for wfc in by_output if wfc['key']==dataset]
            using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
            if len(using_actors):remainings[site][dataset]["reasons"].append('output')
            actors = getWorkflowByMCPileup( url, dataset , details=True)
            #actors = [wfc['doc'] for wfc in by_pileup if wfc['key']==dataset]
            using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
            if len(using_actors):remainings[site][dataset]["reasons"].append('pilup')
        
            print dataset,remainings[site][dataset]["reasons"]

        #print "\t",sum_waiting,"[GB] could be freed by custodial"
        #print "\t",sum_unlocked,"[GB] is not locked by me"

        #open('%s/remaining_%s.json'%(monitor_dir,site),'w').write( json.dumps( remainings[site] , indent=2))
        eosFile('%s/remaining_%s.json'%(monitor_dir,site),'w').write( json.dumps( remainings[site] , indent=2)).close()
        

        ld = remainings[site].items()
        ld.sort( key = lambda i:i[1]['size'], reverse=True)
        table = "<html>Updated %s GMT, <a href=remaining_%s.json>json data</a><br>"%(time.asctime(time.gmtime()),site)

        accumulate = defaultdict(lambda : defaultdict(float))
        for item in remainings[site]:
            tier = item.split('/')[-1]

            for reason in remainings[site][item]['reasons']:
                accumulate[reason][tier] += remainings[site][item]['size']
        table += "<table border=1></thead><tr><th>Reason</th><th>size [TB]</th></thead>"
        for reason in accumulate:
            s=0
Esempio n. 11
0
    def run(self):
        site = self.site
        print "checking on site",site
        si = self.SI
        UC = self.UC
        RDI = self.RDI
        options = self.options
        locks = self.locks
        waiting = self.waiting
        stuck = self.stuck
        missing = self.missing
        remainings = {}
        
        ds = si.getRemainingDatasets(si.CE_to_SE(site))
        #print len(ds)
        taken_size=0.
        sum_waiting=0.
        sum_stuck=0.
        sum_missing=0.
        sum_unlocked=0.
        n_ds = options.ndatasets
        i_ds = 0
        ds_threads = []
        for i_ds,(size,dataset) in enumerate(ds):
            if n_ds and i_ds>=n_ds: break
            remainings[dataset] = {"size" : size, "reasons": []}
            #print "-"*10
            if not dataset in locks:
                #print dataset,"is not locked"
                sum_unlocked += size
                remainings[dataset]["reasons"].append('unlock')
            else:
                remainings[dataset]["reasons"].append('lock')
            if dataset in waiting:
                #print dataset,"is waiting for custodial"
                sum_waiting+=size
                remainings[dataset]["reasons"].append('tape')

            if dataset in stuck:
                sum_stuck+=size
                remainings[dataset]["reasons"].append('stuck-tape')
            if dataset in missing:
                sum_missing +=size
                remainings[dataset]["reasons"].append('missing-tape')

            ds_threads.append( DatasetCheckBuster( dataset = dataset,
                                                   url = url))

        
        run_threads = ThreadHandler( threads = ds_threads,
                                     label = '%s Dataset Threads'%site,
                                     n_threads = 10 ,
                                     start_wait = 0,
                                     timeout = None,
                                     verbose=True)
        ## start and sync
        run_threads.run()
        #run_threads.start()
        #while run_threads.is_alive():
        #    time.sleep(10)        

        for t in run_threads.threads:
            remainings[t.dataset]["reasons"].extend( t.reasons )
            remainings[t.dataset]["reasons"].sort()
            print t.dataset,remainings[t.dataset]["reasons"]

        #print "\t",sum_waiting,"[GB] could be freed by custodial"
        print "\t",sum_unlocked,"[GB] is not locked by unified"

        print "updating database with remaining datasets"
        RDI.set(site, remainings)
        try:
            eosFile('%s/remaining_%s.json'%(monitor_dir,site),'w').write( json.dumps( remainings , indent=2)).close()
        except:
            pass

        ld = remainings.items()
        ld.sort( key = lambda i:i[1]['size'], reverse=True)
        table = "<html>Updated %s GMT, <a href=remaining_%s.json>json data</a><br>"%(time.asctime(time.gmtime()),site)

        accumulate = defaultdict(lambda : defaultdict(float))
        for item in remainings:
            tier = item.split('/')[-1]

            for reason in remainings[item]['reasons']:
                accumulate[reason][tier] += remainings[item]['size']
        table += "<table border=1></thead><tr><th>Reason</th><th>size [TB]</th></thead>"
        for reason in accumulate:
            s=0
            table += "<tr><td>%s</td><td><ul>"% reason
            subitems = accumulate[reason].items()
            subitems.sort(key = lambda i:i[1], reverse=True)

            for tier,ss in subitems:
                table += "<li> %s : %10.3f</li>"%( tier, ss/1024.)
                s+=  ss/1024.
            table+="</ul>total : %.3f</td>"%s

        table += "</table>\n"
        table += "<table border=1></thead><tr><th>Dataset</th><th>Size [GB]</th><th>Label</th></tr></thead>\n"
        only_unlock = set()
        for item in ld:
            ds_name = item[0]
            reasons = item[1]['reasons']
            sub_url = '<a href="https://cmsweb.cern.ch/das/request?input=%s">%s</a>'%(ds_name, ds_name)
            if 'unlock' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?block=%s%%23*&node=%s">block</a>'%(ds_name, site)
            if 'unlock' in reasons or 'input' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?inputdataset=%s&mask=RequestName&mask=RequestStatus">input</a>'%(ds_name)
            if 'unlock' in reasons or 'output' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s&mask=RequestName&mask=RequestStatus">output</a>'%(ds_name)
            if 'pilup' in reasons:
                sub_url += ', <a href="https://cmsweb.cern.ch/reqmgr2/data/request?mc_pileup=%s&mask=RequestName&mask=RequestStatus">secondary</a>'%(ds_name)                
            table+="<tr><td>%s</td><td>%d</td><td><ul>%s</ul></td></tr>\n"%( sub_url, item[1]['size'], "<li>".join([""]+reasons))
            if reasons==['unlock']:
                only_unlock.add(item[0])
        table+="</table></html>"
        eosFile('%s/remaining_%s.html'%(monitor_dir,site),'w').write( table ).close()

        print "checking on unlock only datasets"
        to_ddm = UC.get('tiers_to_DDM')
        #look_at = list(only_unlock)
        look_at = list(only_unlock)[:20]
        #look_at = list([ds for ds in only_unlock if not ds.endswith('NANOAODSIM')])
        for item in look_at:
            tier = item.split('/')[-1]
            ds_status = getDatasetStatus(item)
            print item,ds_status
            if ds_status == 'PRODUCTION':
                print item,"is found",ds_status,"and unklocked on",site
                if options.invalidate_anything_left_production_once_unlocked:
                    print "Setting status to invalid for",item
                    setDatasetStatus(item, 'INVALID')
            if tier in to_ddm:
                print item,"looks like analysis and still dataops on",site
                if options.change_dataops_subs_to_anaops_once_unlocked:
                    print "Sending",item,"to anaops"
                    allCompleteToAnaOps(url, item)
Esempio n. 12
0
def stuckor(url = reqmgr_url):
    mlock = moduleLock()
    if mlock(): return
    TD = transferDataset()
    datasets_by_phid = TD.content()
    really_stuck_dataset = set(json.loads(eosRead('%s/really_stuck_dataset.json'%base_eos_dir)))

    UC = unifiedConfiguration()

    print "make a report of stuck transfers"
    bad_destinations = defaultdict(set)
    bad_sources = defaultdict(set)
    report = ""
    transfer_timeout = UC.get("transfer_timeout")
    transfer_lowrate = UC.get("transfer_lowrate")
    for phid,datasets in datasets_by_phid.items():
        issues = checkTransferLag( url, phid , datasets=list(datasets) )
        for dataset in issues:
            for block in issues[dataset]:
                for destination in issues[dataset][block]:
                    (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination]
                    ## count x_Buffer and x_MSS as one source
                    redones=[]
                    for d in dones:
                        if d.endswith('Buffer') or d.endswith('Export'):
                            if d.replace('Buffer','MSS').replace('Export','MSS') in dones: 
                                continue
                            else: 
                                redones.append( d )
                        else:
                            redones.append( d )
                    dones = list(set( redones ))
                    #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones)
                    if delay>transfer_timeout and rate<transfer_lowrate:
                        if len(dones)>1:
                            ## its the destination that sucks
                            bad_destinations[destination].add( block )
                        else:
                            dum=[bad_sources[d].add( block ) for d in dones]
                        really_stuck_dataset.add( dataset )
                        print "add",dataset,"to really stuck"
                        report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay)
    print "\n"*2

    ## create tickets right away ?
    report+="\nbad sources "+",".join(bad_sources.keys())+"\n"
    for site,blocks in bad_sources.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))
    report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n"
    for site,blocks in bad_destinations.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))

    print '\n'*2,"Datasets really stuck"
    print '\n'.join( really_stuck_dataset )

    print '\n'*2,"report written at %s/logs/incomplete_transfers.log"%unified_url
    print report

    missing_in_action = json.loads(eosRead('%s/incomplete_transfers.json'%monitor_dir))
    stuck_transfers = dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset])
    print '\n'*2,'Stuck dataset transfers'
    print json.dumps(stuck_transfers , indent=2)
    eosFile('%s/stuck_transfers.json'%monitor_pub_dir,'w').write( json.dumps(stuck_transfers , indent=2) ).close()
    eosFile('%s/logs/incomplete_transfers.log'%monitor_dir,'w').write( report ).close()
Esempio n. 13
0
def stagor(url,specific =None, options=None):
    
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    TS = transferStatuses()
    cached_transfer_statuses = TS.content()
    transfer_statuses = {}


    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost_blocks = json.loads(eosRead('%s/lost_blocks_datasets.json'%monitor_dir))
    lost_files = json.loads(eosRead('%s/lost_files_datasets.json'%monitor_dir))
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys()+lost_files.keys()):
        b,f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset,"has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f: 
            print dataset,"has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]


    def time_point(label="",sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s"%(label, nows)
        print "Since start: %s [s]"% ( now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) 
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]"% ( now - time_point.lap ) 
            time_point.lap = now            
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())


    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    needs_by_priority = defaultdict(list)
    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']:
            wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus'])
            wfo.status='away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            ## that usually happens for relvals
            if wfi.request['RequestStatus'] in ['rejected','aborted','aborted-completed','aborted-archived','rejected-archived'] and wfi.isRelval():
                wfo.status='forget'
                session.commit()
                continue
            else:
                print wfo.name,"is",wfi.request['RequestStatus']
                #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
                sendLog("stagor","%s is in %s, set away"%(wfo.name,wfi.request['RequestStatus']), level='critical')
                wfo.status = 'away'
                session.commit()
                continue

        wfois.append( (wfo,wfi) )            
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            needs[wfo.name].append( dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append( dataset )
            wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset))

    time_point("Check staging workflows")            

    #open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2))
    eosFile('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2)).close()
    for prio in needs_by_priority: needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    #open('%s/dataset_priorities.json'%monitor_dir,'w').write( json.dumps( needs_by_priority , indent=2))
    eosFile('%s/dataset_priorities.json'%monitor_dir,'w').write( json.dumps( needs_by_priority , indent=2)).close()
        

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()


    ## first check if anything is inactive
    all_actives = set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()])
    for active_phedexid in all_actives:
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip =False
                sendLog('stagor',"\t%s is staging for %s"%(imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor',"setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()]))
    for phedexid in all_actives:

        if specific: continue

        ## check on transfer completion
        not_cached = False
        if phedexid in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor',"read %s from cache"%phedexid)
            checks = cached_transfer_statuses[phedexid]
        else:
            ## I actually would like to avoid that all I can
            sendLog('stagor','Performing spurious transfer check on %s'% phedexid, level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            try:
                print json.dumps(checks, indent=2)
            except:
                print checks

            if not checks:
                ## this is going to bias quite heavily the rest of the code. we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog('stagor','Stagor has got a skewed input from checkTransferStatus', level='critical')
                checks = {}
                pass
            else:
                TS.add( phedexid, checks)                

        time_point("Check transfer status %s"% phedexid, sub_lap=True)            

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][phedexid]=checks[dsname].values()
        if checks:
            sendLog('stagor',"Checks for %s are %s"%( phedexid, [node.values() for node in checks.values()]))
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ",phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][phedexid]=done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s,v in checks[ds].items():
                dataset_endpoints[ds].add( s )

        if done:
            sendLog('stagor',"%s is done"%phedexid)
            TS.add( phedexid, checks)
        else:
            sendLog('stagor',"%s is not finished %s"%(phedexid, pprint.pformat( checks )))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough]
                sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]



                if sites_incomplete_down:
                    sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds))
                    endpoint_in_downtime[ds].update( sites_incomplete_down )                    
                if sites_incomplete:
                    endpoint_incompleted[ds].update( sites_incomplete )

            

    time_point("Check on-going transfers")            


    print "End points"
    for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps( dataset_endpoints , indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps( endpoint_in_downtime , indent=2)    

    print "End point incomplete in down time"
    for k in endpoint_incompleted: endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps( endpoint_incompleted , indent=2)        


    #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    eosFile('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( TS.content(), indent=2)).close()
    eosFile('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2)).close()

    already_stuck = json.loads( eosRead('%s/stuck_transfers.json'%monitor_pub_dir) ).keys()
    already_stuck.extend( getAllStuckDataset() )
 
    missing_in_action = defaultdict(list)


    print "-"*10,"Checking on workflows in staging","-"*10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda : defaultdict(float))
    presence_cache = defaultdict(dict)

    time_point("Preparing for more")
    for wfo,wfi in wfois:
        print "#"*30
        time_point("Forward checking %s"% wfo.name,sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys={}
        for need in list(primaries)+list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor',"missing transfer report for %s"%need)
                readys[need] = False      
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report','%s does not have a transfer report'%(need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need)
                readys[need] = True
                done_by_input[need] = { "fake" : True }
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor',"%s is ready"%need)
                print json.dumps( done_by_input[need] , indent=2)
                readys[need] = True
            else:
                wfi.sendLog('stagor',"%s is not ready \n%s.\nWaiting for the following:"%(need,json.dumps( done_by_input[need] , indent=2)))
                for request_id in done_by_input[need]:
                    if not done_by_input[need][request_id]:
                        wfi.sendLog('stagor',"https://cmsweb.cern.ch/phedex/prod/Request::View?request={}".format(request_id))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',"all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor',"all needs are fullfilled, already")
                print json.dumps( readys, indent=2 )
        else:
            wfi.sendLog('stagor',"missing requirements")
            copies_needed_from_CPUh, _ = wfi.getNCopies()
            
            copies_needed = copies_needed_from_CPUh
            if 'Campaign' in wfi.request and wfi.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfi.request['Campaign']]:
                copies_needed_from_campaign = CI.campaigns[wfi.request['Campaign']]['maxcopies']
                copies_needed = min(copies_needed_from_campaign, copies_needed_from_CPUh)

            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):#+list(secondaries):
                if endpoint_in_downtime[need] and endpoint_in_downtime[need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime\n%s"%(need, endpoint_in_downtime[need] ))
                    re_transfer=True
                
                if not se_allowed_key in available_cache[need]:
                    available_cache[need][se_allowed_key]  = getDatasetBlocksFraction( url , need, sites=se_allowed )
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need)
                        jump_ahead = True
                    else:
                        wfi.sendLog('stagor',"Available {} times. {} needed".format(available_cache[need][se_allowed_key], copies_needed))
                        missing_and_downtime = list(set(endpoint_in_downtime[need]) & set(endpoint_incompleted[need]))
                        if missing_and_downtime:
                            wfi.sendLog('stagor',"%s is incomplete at %s which is in downtime, trying to move along"%(need, ','.join(missing_and_downtime)))
                            jump_ahead = True
                        else:
                            wfi.sendLog('stagor',"continue waiting for transfers for optimum production performance.")



            ## compute a time since staging to filter jump starting ?                    
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries)+list(secondaries):
                if need in already_stuck: 
                    wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need)
                    jump_ahead = True
                    
            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead"%wfo.name
                details_text += '\n%s wants %s copies'%(wfo.name,copies_needed)
                copies_needed = max(1,copies_needed-1)
                details_text += '\nlowering by one unit to %s'%copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True
                
                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update( presence.keys() )
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog('stagor', "%s is available %s times (%s), at %s"%( need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secon
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog('stagor',"%s is this much transfered %s"%(need, json.dumps(done_by_input[need], indent=2)))
                    all_check&= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:    
                    wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name,"has to wait a bit more"
                    wfi.sendLog('stagor',"needs to wait a bit more")
            else:
                wfi.sendLog('stagor',"not checking availability")

            if re_transfer:
                wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime")
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add( wfo.name )


    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical')

    print "-"*10,"Checking on non-available datasets","-"*10    
    ## now check on those that are not fully available
    
    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min( available_cache[dsname].values() )

    really_stuck_dataset = set()

    for dsname,available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )

        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical')
                    else:
                        print "would send",wf.name,"back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete",dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks,lost_files = findLostBlocksFiles( url, dsname ) if (not dsname.endswith('/RAW')) else ([],[])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names)
                print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss)
                if fraction_loss > 0.05: ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos: 
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog('stagor', '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog('stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ), level='critical')
                        known_lost_blocks[dsname] = [i['name'] for i in lost_blocks]
                really_stuck_dataset.add( dsname )
                  
            if lost_files:
                fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names)
                print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss
                
                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss), level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove    
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical')
                        known_lost_files[dsname] = [i['name'] for i in lost_files]

                ## should the status be change to held-staging and pending on a ticket



            missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] 
            print "\t",done_by_input[dsname]
            print "\tneeds",len(done_by_input[dsname])
            print "\tgot",done_by_input[dsname].values().count(True)
            print "\tmissing",missings
            missing_in_action[dsname].extend( missings )
        


    eosFile('%s/lost_blocks_datasets.json'%monitor_dir,'w').write( json.dumps( known_lost_blocks, indent=2)).close()
    eosFile('%s/lost_files_datasets.json'%monitor_dir,'w').write( json.dumps( known_lost_files, indent=2)).close()
    eosFile('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) ).close()

    print "Stuck transfers and datasets"
    print json.dumps( missing_in_action, indent=2 )


    TD = transferDataset()
    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add( dataset )

    for k in datasets_by_phid:
        #datasets_by_phid[k] = list(datasets_by_phid[k])
        TD.add( k , list(datasets_by_phid[k]))

    #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close()

    eosFile('%s/really_stuck_dataset.json'%base_eos_dir,'w').write( json.dumps(list(really_stuck_dataset), indent=2 )).close()
    print '\n'*2,"Datasets really stuck"
    print '\n'.join( really_stuck_dataset )

    #############
    ## not going further for what matters
    #############
    return 
Esempio n. 14
0
def parse_top(url, options=None):
    """Write the html table of the tasks with the highest failure and cooloff fractions.

    The per-site job summaries of the running workflows are read from the
    'wmstats' entry of dataCache, condensed per task (ACDC and RVCMSSW tasks
    are left out of the per-task sums), and the top 'full_report_top_N' tasks
    by 'failure_f' and by 'cooloff_f' are selected. The corresponding workflows
    are handed to parse_those, and the table is written to toperror.html in
    monitor_eos_dir.

    Args:
        url: passed through to parse_those.
        options: passed through to parse_those.

    Returns:
        None.
    """
    UC = unifiedConfiguration()
    top_N = UC.get('full_report_top_N')
    diagnose_by_agent_by_site = defaultdict(
        lambda: defaultdict(lambda: defaultdict(dict)))
    wm = dataCache.get('wmstats')

    for wfn in wm.keys():
        ## filter by runn*
        if not wm[wfn]['RequestStatus'] in [
                'running-open',
                'running-closed',
                #'completed',
        ]:
            continue
        info = wm[wfn].get('AgentJobInfo', {})
        for agent, ai in info.items():
            ## keep the part of the agent name before the first dot
            an = agent.split('.')[0]
            for task, ti in ai['tasks'].items():
                for site, si in ti.get('sites', {}).items():
                    ssi = condensed(si)
                    diagnose_by_agent_by_site[task][an][site] = ssi

    diagnose = defaultdict(dict)  ## the overall picture of the task
    diagnose_by_site = defaultdict(lambda: defaultdict(dict))
    diagnose_by_agent = defaultdict(lambda: defaultdict(dict))

    for task in diagnose_by_agent_by_site:
        if '_ACDC' in task: continue
        if '_RVCMSSW' in task: continue
        for agent in diagnose_by_agent_by_site[task]:
            for site in diagnose_by_agent_by_site[task][agent]:
                diagnose_by_site[task][site] = add_condensed(
                    diagnose_by_site[task][site],
                    diagnose_by_agent_by_site[task][agent][site])
                diagnose_by_agent[task][agent] = add_condensed(
                    diagnose_by_agent[task][agent],
                    diagnose_by_agent_by_site[task][agent][site])
                diagnose[task] = add_condensed(
                    diagnose[task],
                    diagnose_by_agent_by_site[task][agent][site])

    ##include fractions
    for t, ti in diagnose.items():
        diagnose[t].update(ratios(ti))
    for t in diagnose_by_site:
        for s, ti in diagnose_by_site[t].items():
            diagnose_by_site[t][s].update(ratios(ti))
    for t in diagnose_by_agent:
        for a, ti in diagnose_by_agent[t].items():
            diagnose_by_agent[t][a].update(ratios(ti))
    for t in diagnose_by_agent_by_site:
        for a in diagnose_by_agent_by_site[t]:
            for s, ti in diagnose_by_agent_by_site[t][a].items():
                diagnose_by_agent_by_site[t][a][s].update(ratios(ti))

    ## rank on the fractions (cooloff_f/failure_f), not on the absolute counts
    top_cooloff = dict(
        sorted([(t, i.get('cooloff_f', 0)) for t, i in diagnose.items()],
               key=lambda o: o[1],
               reverse=True)[:top_N])
    top_failure = dict(
        sorted([(t, i.get('failure_f', 0)) for t, i in diagnose.items()],
               key=lambda o: o[1],
               reverse=True)[:top_N])

    ## group the selected tasks by workflow, on the exact workflow name taken
    ## from the task path. A substring test on the task path would also pick up
    ## the tasks of any other workflow whose name contains this one.
    tasks_by_wf = defaultdict(set)
    tops = defaultdict(int)
    for top in (top_cooloff, top_failure):
        for task, N in top.items():
            wf = task.split('/')[1]
            tasks_by_wf[wf].add(task)
            tops[wf] += N

    all_bad_wfs = set(tasks_by_wf.keys())

    print "found", len(all_bad_wfs), "to parse for detailled error report"
    parse_those(url, options, all_bad_wfs)

    ht = eosFile('%s/toperror.html' % monitor_eos_dir, 'w')
    ht.write("""<html>
Report of workflows with top %s error in failure and cooloff<br>
Last updated on %s (GMT)

<table border=1>
<thead><tr><th>Workflow</th><th>Task</th><th>Success</th><th>Failures</th><th>Fail.Frac</th><th>Cooloffs</th><th>Cool.Frac</th><th> Task Error Report </th> </tr></thead>
""" % (top_N, time.asctime(time.gmtime())))

    ## sort by max errors
    for iw, (wf, count) in enumerate(
            sorted(tops.items(), key=lambda o: o[1], reverse=True)):
        ## alternate the row color from one workflow to the next
        lcol = 'bgcolor=%s' % ('white' if iw % 2 == 0 else 'lightblue')
        for task in sorted(tasks_by_wf[wf]):
            tdiag = diagnose.get(task, {})
            print task
            print tdiag
            ht.write(
                '<tr %s><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n'
                % (lcol, wf, task.split('/')[-1], tdiag.get('success', '-'),
                   tdiag.get('failure', '-'), tdiag.get('failure_f', '-'),
                   tdiag.get('cooloff', '-'), tdiag.get('cooloff_f', '-'),
                   '<a href=%s/report/%s#%s> report </a>' %
                   (unified_url, wf, task.split('/')[-1])))
    ht.write('</table></html>')
    ht.close()
Esempio n. 15
0
        else:
            print "\nrelocking", dataset
            newly_locking.add(dataset)

        time_point("Checked all")
    except Exception as e:
        print "Error in checking unlockability. relocking", dataset
        print str(e)
        newly_locking.add(dataset)

## just for a couple of rounds
waiting_for_custodial = {}
stuck_custodial = {}
lagging_custodial = {}
missing_approval_custodial = {}
eosFile('%s/waiting_custodial.json' % monitor_dir,
        'w').write(json.dumps(waiting_for_custodial, indent=2)).close()
eosFile('%s/stuck_custodial.json' % monitor_pub_dir,
        'w').write(json.dumps(stuck_custodial, indent=2)).close()
eosFile('%s/lagging_custodial.json' % monitor_dir,
        'w').write(json.dumps(lagging_custodial, indent=2)).close()
eosFile('%s/missing_approval_custodial.json' % monitor_dir,
        'w').write(json.dumps(missing_approval_custodial, indent=2)).close()

## then for all that would have been invalidated from the past, check whether you can unlock the wf based on output
for wfo in session.query(Workflow).filter(Workflow.status == 'forget').all():
    wfi = workflowInfo(url, wfo.name)
    if all([o not in newly_locking for o in wfi.request['OutputDatasets']
            ]) and not 'unlock' in wfo.status:
        wfo.status += '-unlock'
        print "then setting", wfo.name, "to", wfo.status
    session.commit()
Esempio n. 16
0
def stagor(url, specific=None, options=None):

    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    TS = transferStatuses()
    cached_transfer_statuses = TS.content()
    transfer_statuses = {}

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0

    lost_blocks = json.loads(
        eosRead('%s/lost_blocks_datasets.json' % monitor_dir))
    lost_files = json.loads(
        eosRead('%s/lost_files_datasets.json' % monitor_dir))
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys() + lost_files.keys()):
        b, f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset, "has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f:
            print dataset, "has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    def time_point(label="", sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s" % (label, nows)
        print "Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())

    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    needs_by_priority = defaultdict(list)
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in [
                'running-open', 'running-closed', 'completed', 'assigned',
                'acquired'
        ]:
            wfi.sendLog('stagor',
                        "is in status %s" % wfi.request['RequestStatus'])
            wfo.status = 'away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            ## that usually happens for relvals
            if wfi.request['RequestStatus'] in [
                    'rejected', 'aborted', 'aborted-completed',
                    'aborted-archived', 'rejected-archived'
            ] and wfi.isRelval():
                wfo.status = 'forget'
                session.commit()
                continue
            else:
                print wfo.name, "is", wfi.request['RequestStatus']
                #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
                sendLog("stagor",
                        "%s is in %s, set away" %
                        (wfo.name, wfi.request['RequestStatus']),
                        level='critical')
                wfo.status = 'away'
                session.commit()
                continue

        wfois.append((wfo, wfi))
        _, primaries, _, secondaries = wfi.getIO()
        for dataset in list(primaries) + list(secondaries):
            needs[wfo.name].append(dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append(dataset)
            wfi.sendLog('stagor', '%s needs %s' % (wfo.name, dataset))

    time_point("Check staging workflows")

    open('%s/dataset_requirements.json' % monitor_dir,
         'w').write(json.dumps(needs, indent=2))
    for prio in needs_by_priority:
        needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    open('%s/dataset_priorities.json' % monitor_dir,
         'w').write(json.dumps(needs_by_priority, indent=2))

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()

    ## first check if anything is inactive
    all_actives = set([
        transfer.phedexid for transfer in session.query(TransferImp).filter(
            TransferImp.active).all()
    ])
    for active_phedexid in all_actives:
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip = False
                sendLog(
                    'stagor', "\t%s is staging for %s" %
                    (imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor', "setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(
        set([
            transfer.phedexid for transfer in session.query(
                TransferImp).filter(TransferImp.active).all()
        ]))
    for phedexid in all_actives:

        if specific: continue

        ## check on transfer completion
        not_cached = False
        if phedexid in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor', "read %s from cache" % phedexid)
            checks = cached_transfer_statuses[phedexid]
        else:
            ## I actually would like to avoid that all I can
            sendLog('stagor',
                    'Performing spurious transfer check on %s' % phedexid,
                    level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            try:
                print json.dumps(checks, indent=2)
            except:
                print checks

            if not checks:
                ## this is going to bias quite heavily the rest of the code. we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog(
                    'stagor',
                    'Stagor has got a skewed input from checkTransferStatus',
                    level='critical')
                checks = {}
                pass
            else:
                TS.add(phedexid, checks)

        time_point("Check transfer status %s" % phedexid, sub_lap=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname] = {}
                if not dsname in completion_by_input:
                    completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid] = all(
                    map(lambda i: i >= good_enough, checks[dsname].values()))
                completion_by_input[dsname][phedexid] = checks[dsname].values()
        if checks:
            sendLog(
                'stagor', "Checks for %s are %s" %
                (phedexid, [node.values() for node in checks.values()]))
            done = all(
                map(
                    lambda i: i >= good_enough,
                    list(
                        itertools.chain.from_iterable(
                            [node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ", phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:  # and tr_wf.status == 'staging':
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id] = {}
                done_by_wf_id[tr_wf.id][phedexid] = done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s, v in checks[ds].items():
                dataset_endpoints[ds].add(s)

        if done:
            sendLog('stagor', "%s is done" % phedexid)
            TS.add(phedexid, checks)
        else:
            sendLog(
                'stagor',
                "%s is not finished %s" % (phedexid, pprint.pformat(checks)))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [
                    SI.SE_to_CE(s) for s, v in checks[ds].items()
                    if v < good_enough
                ]
                sites_incomplete_down = [
                    s for s in sites_incomplete if not s in SI.sites_ready
                ]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]

                if sites_incomplete_down:
                    sendLog(
                        'stagor',
                        "%s are in downtime, while waiting for %s to get there"
                        % (",".join(sites_incomplete_down), ds))
                    endpoint_in_downtime[ds].update(sites_incomplete_down)
                if sites_incomplete:
                    endpoint_incompleted[ds].update(sites_incomplete)

    time_point("Check on-going transfers")

    print "End points"
    for k in dataset_endpoints:
        dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps(dataset_endpoints, indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime:
        endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps(endpoint_in_downtime, indent=2)

    print "End point incomplete in down time"
    for k in endpoint_incompleted:
        endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps(endpoint_incompleted, indent=2)

    #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    eosFile('%s/transfer_statuses.json' % monitor_dir,
            'w').write(json.dumps(TS.content(), indent=2)).close()
    eosFile('%s/dataset_endpoints.json' % monitor_dir,
            'w').write(json.dumps(dataset_endpoints, indent=2)).close()

    already_stuck = json.loads(
        eosRead('%s/stuck_transfers.json' % monitor_pub_dir)).keys()
    already_stuck.extend(getAllStuckDataset())

    missing_in_action = defaultdict(list)

    print "-" * 10, "Checking on workflows in staging", "-" * 10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda: defaultdict(float))
    presence_cache = defaultdict(dict)

    time_point("Preparing for more")
    for wfo, wfi in wfois:
        print "#" * 30
        time_point("Forward checking %s" % wfo.name, sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_, primaries, _, secondaries,
         sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys = {}
        for need in list(primaries) + list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor', "missing transfer report for %s" % need)
                readys[need] = False
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report',
                          '%s does not have a transfer report' % (need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog(
                    'stagor',
                    "assuming it is OK for secondary %s to have no attached transfers"
                    % need)
                readys[need] = True
                done_by_input[need] = {"fake": True}
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor', "%s is ready" % need)
                print json.dumps(done_by_input[need], indent=2)
                readys[need] = True
            else:
                wfi.sendLog(
                    'stagor', "%s is not ready \n%s" %
                    (need, json.dumps(done_by_input[need], indent=2)))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',
                            "all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor', "all needs are fullfilled, already")
                print json.dumps(readys, indent=2)
        else:
            wfi.sendLog('stagor', "missing requirements")
            copies_needed, _ = wfi.getNCopies()
            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):  #+list(secondaries):
                if endpoint_in_downtime[need] and endpoint_in_downtime[
                        need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog(
                        'stagor',
                        "%s has only incomplete endpoint in downtime\n%s" %
                        (need, endpoint_in_downtime[need]))
                    re_transfer = True

                if not se_allowed_key in available_cache[need]:
                    available_cache[need][
                        se_allowed_key] = getDatasetBlocksFraction(
                            url, need, sites=se_allowed)
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog(
                            'stagor',
                            "assuming it is OK to move on like this already for %s"
                            % need)
                        jump_ahead = True
                    else:
                        wfi.sendLog(
                            'stagor', "Available %s times" %
                            available_cache[need][se_allowed_key])
                        missing_and_downtime = list(
                            set(endpoint_in_downtime[need])
                            & set(endpoint_incompleted[need]))
                        if missing_and_downtime:
                            wfi.sendLog(
                                'stagor',
                                "%s is incomplete at %s which is in downtime, trying to move along"
                                % (need, ','.join(missing_and_downtime)))
                            jump_ahead = True
                        else:
                            wfi.sendLog(
                                'stagor',
                                "continue waiting for transfers for optimum production performance."
                            )

            ## compute a time since staging to filter jump starting ?
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries) + list(secondaries):
                if need in already_stuck:
                    wfi.sendLog('stagor',
                                "%s is stuck, so try to jump ahead" % need)
                    jump_ahead = True

            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead" % wfo.name
                details_text += '\n%s wants %s copies' % (wfo.name,
                                                          copies_needed)
                copies_needed = max(1, copies_needed - 1)
                details_text += '\nlowering by one unit to %s' % copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True

                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][
                            se_allowed_key] = getDatasetPresence(
                                url, need, within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update(presence.keys())
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog(
                        'stagor', "%s is available %s times (%s), at %s" %
                        (need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secon
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog(
                        'stagor', "%s is this much transfered %s" %
                        (need, json.dumps(done_by_input[need], indent=2)))
                    all_check &= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:
                    wfi.sendLog(
                        'stagor',
                        "needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name, "has to wait a bit more"
                    wfi.sendLog('stagor', "needs to wait a bit more")
            else:
                wfi.sendLog('stagor', "not checking availability")

            if re_transfer:
                wfi.sendLog(
                    'stagor',
                    "Sending back to considered because of endpoint in downtime"
                )
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add(wfo.name)

    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor',
                "sending back to considered the following workflows \n%s" %
                ('\n'.join(send_back_to_considered)),
                level='critical')

    print "-" * 10, "Checking on non-available datasets", "-" * 10
    ## now check on those that are not fully available

    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min(available_cache[dsname].values())

    really_stuck_dataset = set()

    for dsname, available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(
                Workflow.name == using_it).first()
            if wf:
                using_wfos.append(wf)

        if not len(done_by_input[dsname]):
            print "For dataset", dsname, "there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending", wf.name, "back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor',
                                "%s was send back and might be trouble" %
                                wf.name,
                                level='critical')
                    else:
                        print "would send", wf.name, "back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog(
                            'stagor',
                            "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."
                            % wf.name,
                            level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete", dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks, lost_files = findLostBlocksFiles(
                url, dsname) if (not dsname.endswith('/RAW')) else ([], [])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss, _, n_missing = getDatasetBlockFraction(
                    dsname, lost_block_names)
                print "We have lost", len(
                    lost_block_names
                ), "blocks", lost_block_names, "for %f%%" % (100. *
                                                             fraction_loss)
                if fraction_loss > 0.05:  ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d blocks, for %d events, %3.2f %% loss'
                        % (dsname, len(lost_block_names), n_missing,
                           100 * fraction_loss),
                        level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog(
                                'stagor',
                                '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'
                                % (wf.name, dsname, len(lost_block_names),
                                   n_missing, 100 * fraction_loss),
                                level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog(
                            'stagor',
                            '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_block_names), n_missing,
                               fraction_loss, '\n'.join(lost_block_names)),
                            level='critical')
                        known_lost_blocks[dsname] = [
                            i['name'] for i in lost_blocks
                        ]
                really_stuck_dataset.add(dsname)

            if lost_files:
                fraction_loss, _, n_missing = getDatasetFileFraction(
                    dsname, lost_file_names)
                print "We have lost", len(
                    lost_file_names
                ), "files", lost_file_names, "for %f%%" % fraction_loss

                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d files, for %d events, %f %% loss' %
                        (dsname, len(lost_file_names), n_missing,
                         fraction_loss),
                        level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog(
                            'stagor',
                            '%s is missing %d files, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_file_names), n_missing,
                               fraction_loss, '\n'.join(lost_file_names)),
                            level='critical')
                        known_lost_files[dsname] = [
                            i['name'] for i in lost_files
                        ]

                ## should the status be change to held-staging and pending on a ticket

            missings = [
                pid for (pid, d) in done_by_input[dsname].items() if d == False
            ]
            print "\t", done_by_input[dsname]
            print "\tneeds", len(done_by_input[dsname])
            print "\tgot", done_by_input[dsname].values().count(True)
            print "\tmissing", missings
            missing_in_action[dsname].extend(missings)

    rr = eosFile('%s/lost_blocks_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_blocks, indent=2))
    rr.close()

    rr = eosFile('%s/lost_files_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_files, indent=2))
    rr.close()

    eosFile('%s/incomplete_transfers.json' % monitor_dir,
            'w').write(json.dumps(missing_in_action, indent=2)).close()
    print "Stuck transfers and datasets"
    print json.dumps(missing_in_action, indent=2)

    TD = transferDataset()
    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add(dataset)

    for k in datasets_by_phid:
        #datasets_by_phid[k] = list(datasets_by_phid[k])
        TD.add(k, list(datasets_by_phid[k]))

    #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close()

    eosFile('%s/really_stuck_dataset.json' % base_eos_dir,
            'w').write(json.dumps(list(really_stuck_dataset),
                                  indent=2)).close()
    print '\n' * 2, "Datasets really stuck"
    print '\n'.join(really_stuck_dataset)

    #############
    ## not going further for what matters
    #############
    return
Esempio n. 17
0
def batchor(url):
    UC = unifiedConfiguration()
    SI = global_SI()
    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        all_wfs.extend(
            getWorkflows(url,
                         'assignment-approved',
                         details=True,
                         user=user,
                         rtype='TaskChain'))

    wfs = filter(
        lambda r: r['SubRequestType'] == 'RelVal'
        if 'SubRequestType' in r else False, all_wfs)
    ## need a special treatment for those
    hi_wfs = filter(
        lambda r: r['SubRequestType'] == 'HIRelVal'
        if 'SubRequestType' in r else False, all_wfs)

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print "Relval:", wf['RequestName'], wf['Campaign']
        by_campaign[wf['Campaign']].add(wf['PrepID'])

    for wf in hi_wfs:
        print "HI Relval:", wf['RequestName'], wf['Campaign']
        by_hi_campaign[wf['Campaign']].add(wf['PrepID'])

    default_setup = {
        "go": True,
        "parameters": {
            "SiteWhitelist": ["T1_US_FNAL"],
            "MergedLFNBase": "/store/relval",
            "Team": "relval",
            "NonCustodialGroup": "RelVal"
        },
        "custodial_override": "notape",
        "phedex_group": "RelVal",
        "lumisize": -1,
        "fractionpass": 0.0,
        "maxcopies": 1
    }
    default_hi_setup = copy.deepcopy(default_setup)

    add_on = {}
    batches = json.loads(eosRead('%s/batches.json' % base_eos_dir))
    relval_routing = UC.get('relval_routing')

    def pick_one_site(p):
        ## modify the parameters on the spot to have only one site
        if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(
                p["parameters"]["SiteWhitelist"]) > 1:
            choose_from = list(
                set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready))
            picked = random.choice(choose_from)
            print "picked", picked, "from", choose_from
            p["parameters"]["SiteWhitelist"] = [picked]

    for campaign in by_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_setup)

        for key in relval_routing:
            if key in campaign:
                ## augment with the routing information
                augment_with = relval_routing[key]
                print "Modifying the batch configuration because of keyword", key
                print "with", augment_with
                setup = deep_update(setup, augment_with)

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the relval campaigns %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(
            set(
                list(copy.deepcopy(by_campaign[campaign])) +
                batches[campaign]))

    for campaign in by_hi_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_hi_setup)
        possible_sites = set(["T1_DE_KIT", "T1_FR_CCIN2P3"])
        hi_site = random.choice(list(possible_sites))
        setup["parameters"]["SiteWhitelist"] = [hi_site]

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the HI relval campaigns %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(
            set(
                list(copy.deepcopy(by_hi_campaign[campaign])) +
                batches[campaign]))

    eosFile('%s/batches.json' % base_eos_dir,
            'w').write(json.dumps(batches, indent=2)).close()

    ## open the campaign configuration
    campaigns = json.loads(eosRead('%s/campaigns.relval.json' % base_eos_dir))

    ## protect for overwriting ??
    for new_campaign in list(set(add_on.keys()) - set(campaigns.keys())):
        ## this is new, and can be announced as such
        print new_campaign, "is new stuff"
        subject = "Request of RelVal samples batch %s" % new_campaign
        text = """Dear all, 
A new batch of relval workflows was requested.

Batch ID:

%s

Details of the workflows:

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message""" % (
            new_campaign,
            new_campaign,
        )

        print subject
        print text
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor', text, level='critical')

    ## go through all existing campaigns and remove the ones not in use anymore ?
    for old_campaign in campaigns.keys():
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        if not all_in_batch: continue
        is_batch_done = all(
            map(
                lambda s: not s in [
                    'completed', 'force-complete', 'running-open',
                    'running-closed', 'acquired', 'assigned',
                    'assignment-approved'
                ], [wf['RequestStatus'] for wf in all_in_batch]))
        ## check all statuses
        if is_batch_done:
            #print "batch",old_campaign,"can be closed or removed if necessary"
            #campaigns[old_campaign]['go'] = False ## disable
            campaigns.pop(old_campaign)  ## or just drop it all together ?
            print "batch", old_campaign, " configuration was removed"

    ## merge all anyways
    campaigns.update(add_on)

    ## write it out for posterity
    open('campaigns.json.updated', 'w').write(json.dumps(campaigns, indent=2))

    ## read back
    rread = json.loads(open('campaigns.json.updated').read())

    os.system('cp campaigns.json.updated %s/campaigns.relval.json' %
              monitor_dir)
    os.system('cp campaigns.json.updated %s/campaigns.relval.json' %
              base_eos_dir)