def main():
    parser = optparse.OptionParser("Usage %prog [WF1 WF2 ... | -f FILE] PRIO")
    parser.add_option('-f', '--file', help='Text file', dest='file')
    options, args = parser.parse_args()

    if options.file:
        wfs = [l.strip() for l in open(options.file).readlines() if l.strip()]
    elif len(args) >= 2:
        # get workflow and priority
        wfs = args[:-1]
    else:
        parser.error("Provide workflow names and priority")
        sys.exit(0)  ## unreachable: parser.error() already exits
        
    priority = args[-1]
    # repeat for everyone
    for wf in wfs:
        reqMgrClient.changePriorityWorkflow(url, wf, priority)
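
The file branch above never closes its handle, and if -f is given without a trailing PRIO argument, args[-1] raises an IndexError. A minimal sketch of the same file read using a context manager (standard library only; read_workflows is an illustrative helper, not part of reqMgrClient):

def read_workflows(path):
    ## one workflow name per line; blank lines are skipped
    with open(path) as fh:
        return [line.strip() for line in fh if line.strip()]
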
Example #3
def singleRecovery(url, task, initial, actions, do=False,
                   priority_change=False):
    print "Inside single recovery!"
    payload = {
        "Requestor": os.getenv('USER'),
        "RequestType": "Resubmission",
        "ACDCServer": initial['ConfigCacheUrl'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0
    }
    copy_over = ['RequestPriority', 'RequestString', 'CMSSWVersion']
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print c, "not in the initial payload"

    ## boost the recovery priority over the initial workflow
    #payload['RequestPriority'] *= 10  ## max priority is 1M

    original_priority = payload['RequestPriority']
    payload['RequestPriority'] = min(500000, payload['RequestPriority'] *
                                     2)  ## never above 500k

    if priority_change:

        failjobs = getFailedJobs(task)
        if failjobs and failjobs > 500:
            payload['RequestPriority'] = min(500000, original_priority * 1.2)

    #change parameters based on actions here
    if actions:
        for action in actions:
            if action.startswith('mem') and actions[action] != "" and actions[
                    action] != 'Same':
                #if multicore parameter is also used, need to scale memory by the new number of cores
                if 'multicore' in actions and actions['multicore'] != "":
                    continue
                ## TaskChains need special treatment to set the memory on all tasks
                try:
                    set_to = int(actions[action])
                except ValueError:
                    if actions[action].lower().endswith('gb'):
                        set_to = int(actions[action][:-2]) * 1024
                    elif actions[action].lower().endswith('kb'):
                        set_to = int(actions[action][:-2])
                    else:
                        # invalid parameter -- making no change
                        continue
                if 'TaskChain' in initial:
                    mem_dict = {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t in initial:
                            tname = payload.setdefault(t,
                                                       initial[t])['TaskName']
                            payload[t]['Memory'] = set_to
                        else:
                            break
                    payload['Memory'] = set_to
                    print "Memory set to: ", set_to
                else:
                    payload['Memory'] = set_to
                    print "Memory set to: ", set_to

            if action.startswith('multicore') and actions[action] != "":
                set_to = int(actions[action])
                ## TaskChains need special treatment to set the multicore and memory values on all tasks
                if 'TaskChain' in initial:
                    mem_dict = payload['Memory'] if type(
                        payload['Memory']) == dict else {}
                    core_dict = {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t in initial:
                            tname = payload.setdefault(t,
                                                       initial[t])['TaskName']
                            mem = mem_dict.setdefault(tname,
                                                      payload[t]['Memory'])

                            #Need to scale the memory by the new number of cores
                            initial_cores = payload[t].setdefault(
                                'Multicore', 1)

                            if 'memory' in actions and actions[
                                    'memory'] != "" and actions[
                                        'memory'] != 'Same':
                                mem = actions['memory']

                            fraction_constant = 0.4
                            mem_per_core_c = int((1 - fraction_constant) *
                                                 mem / float(initial_cores))

                            mem_dict[tname] = int(mem +
                                                  (set_to - initial_cores) *
                                                  mem_per_core_c)
                            core_dict[tname] = set_to

                            print "For ", t
                            print "Multicore set to ", set_to
                            print "Memory set to ", mem_dict[tname]
                        else:
                            break
                    payload['Memory'] = mem_dict
                    payload['Multicore'] = core_dict

                else:
                    #Need to scale the memory by the new number of cores
                    initial_cores = initial.setdefault('Multicore', 1)

                    mem = payload['Memory']
                    if 'memory' in actions and actions[
                            'memory'] != "" and actions['memory'] != 'Same':
                        mem = actions['memory']

                    fraction_constant = 0.4
                    mem_per_core_c = int(
                        (1 - fraction_constant) * mem / float(initial_cores))

                    payload['Multicore'] = set_to
                    payload['Memory'] = int(mem + (set_to - initial_cores) *
                                            mem_per_core_c)

                    print "Multicore set to ", set_to
                    print "Memory set to ", payload['Memory']

            if action.startswith('split'):

                split_alert = (initial['RequestType'] in ['MonteCarlo'])
                for key in initial:
                    if key == 'SplittingAlgo' and (initial[key]
                                                   in ['EventBased']):
                        split_alert = True
                    elif key.startswith('Task') and key != 'TaskChain':
                        for key2 in initial[key]:
                            if key2 == 'TaskName':
                                this_taskname = initial[key][key2]
                                recover_task = task.split('/')[-1]
                                print "For recovery of task", recover_task
                                print "Looking at task", this_taskname
                                if (recover_task == this_taskname) and (
                                        initial[key]['SplittingAlgo']
                                        in ['EventBased']):
                                    ## the task to be recovered is actually of the wrong type to allow change of splitting
                                    sendLog(
                                        'actor',
                                        'To recover on %s, changing the splitting on %s is not really allowed and this will be ignored instead of failing acdc.'
                                        %
                                        (task, initial[key]['SplittingAlgo']),
                                        level='critical')
                                    ## do not send an alert and stop the acdc
                                    #split_alert = True

                if split_alert:
                    sendLog('actor',
                            'Cannot change splitting for %s' %
                            initial['RequestName'],
                            level='critical')
                    print "I should not be doing splitting for this type of request", initial[
                        'RequestName']
                    return None

    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        if initial_string[4].isdigit():
            acdc_round = int(initial_string[4])
        acdc_round += 1

    initial_string = initial_string.replace('ACDC_', '').replace(
        'ACDC%d_' % (acdc_round - 1), '')
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print json.dumps(payload, indent=2)
        return None

    print "ACDC payload"
    print json.dumps(payload, indent=2)
    print actions

    ## submit here
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print "Error in making ACDC for", initial["RequestName"]
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print "Error twice in making ACDC for", initial["RequestName"]
            sendLog('actor',
                    'Failed twice in making ACDCs for %s!' %
                    initial['RequestName'],
                    level='critical')
            return None

    ## change splitting if requested
    if actions:
        for action in actions:
            if action.startswith('split'):
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittingsNew(strip=True)
                if actions[action] != 'Same' and actions[action] != 'max':
                    factor = int(
                        actions[action][0:-1]) if 'x' in actions[action] else 2
                    for split in splittings:
                        split_par = split['splitParams']
                        if split['splitAlgo'] in ['EventBased']:
                            sendLog(
                                'actor',
                                "Changing the splitting on %s for %s is not permitted. Not changing."
                                % (split['splitAlgo'], initial["RequestName"]),
                                level='critical')
                            continue

                        for act in [
                                'avg_events_per_job', 'events_per_job',
                                'lumis_per_job'
                        ]:
                            if act in split_par:
                                print "Changing %s (%d) by a factor %d" % (
                                    act, split_par[act], factor),
                                split_par[act] /= factor
                                split_par[act] = max(split_par[act], 1)
                                print "to", split_par[act]
                                break
                        #split['requestName'] = acdc
                        #print "changing the splitting of",acdc
                        #print json.dumps( split, indent=2 )
                        #print reqMgrClient.setWorkflowSplitting(url, acdc, split )

                    ## consider the splitting
                    if priority_change and failjobs and failjobs < 500 and failjobs * factor > 500:
                        print "splitting causes jobs passing threshold"
                        new_priority = min(500000, original_priority * 1.2)
                        split_change = reqMgrClient.changePriorityWorkflow(
                            url, acdc, new_priority)

                elif 'max' in actions[action]:
                    for split in splittings:
                        split_par = split['splitParams']
                        for act in [
                                'avg_events_per_job', 'events_per_job',
                                'lumis_per_job'
                        ]:
                            if act in split_par:
                                print "Changing %s (%d) " % (act,
                                                             split_par[act]),
                                split_par[act] = 1
                                print "to max splitting ", split_par[act]
                                break
                    if priority_change:
                        new_priority = min(500000, original_priority * 1.2)
                        split_change = reqMgrClient.changePriorityWorkflow(
                            url, acdc, new_priority)

                print "changing the splitting of", acdc
                print json.dumps(splittings, indent=2)
                done = reqMgrClient.setWorkflowSplitting(url, acdc, splittings)
                ## check on done == True

    data = reqMgrClient.setWorkflowApproved(url, acdc)

    print data
    return acdc
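
The multicore branch above rescales memory by holding a constant fraction (0.4) of it core-independent and scaling the rest linearly with the core count. A self-contained sketch of that arithmetic, with the 0.4 constant taken from the example and rescale_memory as an illustrative name, not the project's API:

FRACTION_CONSTANT = 0.4  ## fraction of memory assumed core-independent

def rescale_memory(mem_mb, old_cores, new_cores):
    ## only the non-constant 60% of memory scales with the core count,
    ## mirroring the mem_per_core_c computation in singleRecovery
    mem_per_core = int((1 - FRACTION_CONSTANT) * mem_mb / float(old_cores))
    return int(mem_mb + (new_cores - old_cores) * mem_per_core)

## e.g. a 1-core task at 2300 MB moved to 8 cores:
assert rescale_memory(2300, 1, 8) == 2300 + 7 * 1380  ## 11960 MB
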
Example #4
def completor(url, specific):
    mlock = moduleLock(silent=True)
    if mlock(): return 


    use_mcm = True
    up = componentInfo(soft=['mcm','wtc','jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    safe_mode = False

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira',False) else None

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## take them in random order so that the same workflows are not always seen first
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]
        

    all_stuck = set()
    ## take into account what stagor was saying
    for itry in range(5):
        try:
            all_stuck.update( json.loads( eosRead('%s/stuck_transfers.json'%monitor_pub_dir)))
            break
        except:
            time.sleep(2)
        
    for itry in range(5):
        try:
            ## take into account the block that needed to be repositioned recently
            all_stuck.update( [b.split('#')[0] for b in json.loads( eosRead('%s/missing_blocks.json'%monitor_dir)) ] )
            break
        except:
            time.sleep(2)

    ## take into account all stuck block and dataset from transfer team
    all_stuck.update( getAllStuckDataset()) 


    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {} 
    timeout = {}
    campaign_injection_delay = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']
        if 'injection-delay' in CI.campaigns[c]:
            campaign_injection_delay[c] = CI.campaigns[c]['injection-delay']
        if 'overdoing-complete' in CI.campaigns[c]:
            overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print "can truncate complete on"
    print json.dumps( truncate_fractions ,indent=2)
    print "can overide on"
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time above which to fire a JIRA
    jira_priority_and_delays = { 110000 : 21,
                                 90000 : 28,
                                 #80000 : 60,
                                 #0 : 90
                                 }

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        campaigns = wfi.getCampaigns()

        #if not any([c in good_fractions.keys() for c in campaigns]): skip=True
        #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True

        for user,spec in overrides.items():
            if not spec: continue
            spec = filter(None, spec)
            if not wfi.request['RequestStatus'] in ['force-complete', 'completed']:
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue



        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask() ## can use that one, and follow mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        #allowed_delay_per_out = {}
        for task,outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask( task )
            for out in outs:
                good_fraction_per_out[out] = good_fractions.get(task_campaign,1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(task_campaign,default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(task_campaign,1000.)
                #allowed_delay_per_out[out] = timeout.get(task_campaign, 14)

        #print "force at", json.dumps( good_fraction_per_out, indent=2)
        #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2)

        now = time.mktime(time.gmtime()) / (60*60*24.)

        priority_log = filter(lambda change: change['Priority'] == priority,wfi.request.get('PriorityTransition',[]))
        if not priority_log:
            print "\tHas no priority log"
            priority_delay = 0
        else:
            then = max([change['UpdateTime'] for change in priority_log]) / (60.*60.*24.)
            priority_delay = now - then ## in days
            print "priority was set to",priority,priority_delay,"[days] ago"

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            delay = 0
        else:
            then = max([change['UpdateTime'] for change in running_log]) / (60.*60.*24.)
            delay = now - then ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog('completor',"Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"%( cpuh, delay, priority, priority_delay))
        if priority_delay!=0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url, original.request['OriginalRequestName'])
        injected_log = filter(lambda change : change["Status"] in ["assignment-approved"],original.request['RequestTransition'])
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / (60.*60.*24.)
            injection_delay = now - injected_on
        

        delay_for_priority_increase = injection_delay
        #delay_for_priority_increase = delay

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority
        
        pop_a_jira = False
        ping_on_jira = 7 *(24*60*60) # 7 days
        for jp,jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd: pop_a_jira = True

        if pop_a_jira and JC:
            j,reopened,just_created = JC.create_or_last( prepid = wfi.request['PrepID'],
                                                    priority = wfi.request['RequestPriority'],
                                                    label = 'Late',
                                                    reopen = True)
            last_time = JC.last_time( j )
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d"%( delay, priority)
                JC.comment(j.key, j_comment)
            

        if delay_for_priority_increase!=None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority:
            quantized = 5000 ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority']+ int((delay_priority_increase * (delay_for_priority_increase - injection_delay_threshold) / 7) / quantized) * quantized
            tail_cutting_priority += 101 ## to signal it is from this mechanism
            tail_cutting_priority = min(400000, tail_cutting_priority) ## never go above 400k priority
            tail_cutting_priority = max(tail_cutting_priority, priority) ## never go below the current value
            
            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, increasing to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','bumping priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority)
                    max_priority-=1
                else:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, would like to increase to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','would like to bump priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    print "Could be changing the priority to higher value, but too many already were done"

        _,prim,_,_ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = max([timeout.get(c,14) for c in campaigns])
            
        monitor_delay = min(monitor_delay, allowed_delay)

        ### skip if too early, to avoid computing the completion fraction right now.
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay: 
            print "not enough time has passed yet"
            continue

        long_lasting[wfo.name] = { "delay" : delay,
                                   "injection_delay" : injection_delay }

        percent_completions = wfi.getCompletionFraction(caller='completor')
        
        if not percent_completions:
            sendLog('completor','%s has no output at all'% wfo.name, level='critical')
            continue

        is_over_allowed_delay = (all([percent_completions[out] >= good_fraction_per_out.get(out,1000.) for out in percent_completions]) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and (all([percent_completions[out] >= truncate_fraction_per_out.get(out,1000.) for out in percent_completions])) and delay >= allowed_delay)
        is_over_completion = (all([percent_completions[out] >= good_fraction_nodelay_per_out.get(out,1000.) for out in percent_completions]))

        if is_over_completion:
            wfi.sendLog('completor', "all is over completed %s\n %s"%( json.dumps( good_fraction_nodelay_per_out, indent=2 ),
                                                                       json.dumps( percent_completions, indent=2 )
                                                                       ))
        elif is_over_allowed_delay:
            wfi.sendLog('completor', "all is above %s \n%s"%( json.dumps(good_fraction_per_out, indent=2 ), 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        elif is_over_truncation_delay:
            wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( json.dumps(truncate_fraction_per_out, indent=2 ),
                                                                                                      json.dumps( percent_completions, indent=2 ) ) )

        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"%(json.dumps( percent_completions, indent=2), 
                                                                                                 json.dumps(good_fraction_per_out, indent=2),
                                                                                                 json.dumps( truncate_fraction_per_out, indent=2),
                                                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        #for output in  percent_completions:
        #    completions[output]['injected'] = then
            

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        ## find ACDCs that might be running
        if max_force>0:
            print "going for force-complete of",wfo.name
            if not safe_mode:
                forceComplete(url, wfi )
                set_force_complete.add( wfo.name )
                wfi.sendLog('completor','going for force completing')
                wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
                max_force -=1
            else:
                sendEmail('completor', 'The workflow %s is ready for force complete, but completor is in safe mode'%wfo.name)
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog('completor','The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))
    eosFile('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )).close()

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )


    print text
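
The tail-priority increase in the example is quantized into 5000-point steps, tagged with +101 so it can be recognized later, and clamped between the current priority and 400k. A minimal sketch of that computation under the same constants (the function name and arguments are illustrative):

def tail_cutting_priority(initial_priority, current_priority,
                          days_since_injection, threshold_days,
                          increase_per_week, quantum=5000):
    ## quantize the weekly increase into fixed 'quantum' steps
    steps = int(increase_per_week * (days_since_injection - threshold_days) / 7. / quantum)
    p = initial_priority + steps * quantum
    p += 101  ## signal that this mechanism set the priority
    p = min(400000, p)  ## never go above 400k priority
    return max(p, current_priority)  ## never go below the current value

## e.g. injected 35 days ago, 21-day threshold, 10k/week increase:
assert tail_cutting_priority(85000, 85000, 35, 21, 10000) == 105101
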
Example #6
def completor(url, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## take them in random order so that the same workflows are not always seen first
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    all_stuck = set()
    ## take into account what stagor was saying
    all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ))
    ## take into account the block that needed to be repositioned recently
    all_stuck.update( [b.split('#')[0] for b in json.loads( open('%s/missing_blocks.json'%monitor_dir).read()) ] )
    ## take into account all stuck block and dataset from transfer team
    all_stuck.update( getAllStuckDataset()) 


    ## per output dataset: a list of fraction / timestamp checkpoints
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())
    
    good_fractions = {}
    truncate_fractions = {} 
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    overrides = getForceCompletes()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print "can truncate complete on"
    print json.dumps( truncate_fractions ,indent=2)
    print "can overide on"
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    
    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()


    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        if not any([c in wfo.name for c in good_fractions]) and not any([c in wfo.name for c in truncate_fractions]): skip=True
        for user,spec in overrides.items():

            if wfi.request['RequestStatus']!='force-complete':
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue


        if wfi.request['RequestStatus'] in ['running-open','running-closed'] and (random.random() < 0.01):
            try:
                parse_one(url, wfo.name )
            except Exception as e:
                print "failed error parsing"
                print str(e)

        c = wfi.request['Campaign']
        #if not c in good_fractions: continue

        good_fraction = good_fractions.get(c,1000.)
        truncate_fraction = truncate_fractions.get(c,1000.)

        print "force at",good_fraction,"truncate at",truncate_fraction
        ignore_fraction = 2.
        
        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request: 
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        ## this is supposed to be the very initial request date, inherited from clones
        try:
            injected_on =time.mktime(time.strptime("-".join(map(lambda n : "%02d"%n, wfi.request['RequestDate'])), "%Y-%m-%d-%H-%M-%S")) / (60.*60.*24.)
            injection_delay = now - injected_on
        except:
            injection_delay = None

        
        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority

        if injection_delay!=None and injection_delay > 50 and False: ## 'and False' keeps this block disabled
            tail_cutting_priority = 200000
            ## bump up to 200k
            print "Injected since",injection_delay,"[days] priority=",priority
            if wfi.request['RequestPriority'] != tail_cutting_priority:
                wfi.sendLog('completor','bumping priority to %d for being injected since'% tail_cutting_priority)
                reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority)

        _,prim,_,_ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = 14
        if c in timeout:
            allowed_delay = timeout[c]
            
        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay,
                                   "injection_delay" : injection_delay }

        percent_completions = {}
        
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )

            #take the less optimistic
            ## taking the max or not: to force-complete on over-production
            percent_completions[output] = min( lumi_completion, event_completion )
            
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        elif is_stuck and (all([percent_completions[out] >= truncate_fraction for out in percent_completions])):
            wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( truncate_fraction,
                                                                                                      json.dumps( percent_completions, indent=2 ) ) )

            sendEmail('workflow for truncation','check on %s'%wfo.name)
            ##continue ## because you do not want to get this one right now
        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %.3f (complete) %.3f truncate \n%s"%(percent_completions.values(), 
                                                                                             good_fraction, truncate_fraction,
                                                                                             json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in  percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        if delay <= allowed_delay: continue
        ## find ACDCs that might be running
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break
    

        if delay >= 40:
            print wfo.name,"has been running for",delay,"days"
            ## bumping up the priority?
            


    if set_force_complete:
        sendLog('completor','The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )


    print text
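
Example #6 takes, per output dataset, the less optimistic of the lumi and event completion ratios. A sketch of that choice with illustrative names (note the quirk it inherits from the example: when only one denominator is known, the other ratio stays at 0., so min() pins the fraction to zero):

def completion_fraction(lumi_count, event_count,
                        lumi_expected=None, event_expected=None):
    lumi_completion = 0.
    event_completion = 0.
    if lumi_expected:
        lumi_completion = lumi_count / float(lumi_expected)
    if event_expected:
        event_completion = event_count / float(event_expected)
    ## take the less optimistic of the two ratios
    return min(lumi_completion, event_completion)

## both denominators known: the smaller ratio wins
assert completion_fraction(900, 800, lumi_expected=1000, event_expected=1000) == 0.8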