Python getDatasetEventsAndLumis Examples

Programming Language: Python

Namespace/Package Name: utils

Method/Function: getDatasetEventsAndLumis

Examples at hotexamples.com: 20

Python getDatasetEventsAndLumis - 20 examples found. These are the top rated real world Python examples of utils.getDatasetEventsAndLumis extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: closor.py Project: areinsvo/WmAgentScripts

def closor(url, specific=None):
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #LI = lockInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')
    ## manually closed-out workflows should get to close with checkor
    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status=='close').all()

    held = set()

    for wfo in wfs:

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis 
        wfi = workflowInfo(url, wfo.name )
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.request['RequestStatus'] in  ['announced','normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status))
        session.commit()


        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda : False)
        #print outputs
        if len(outputs): 
            print wfo.name,wfi.request['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()

            wfi.sendLog('closor',"\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.))
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            #fraction = wfo.fraction_for_closing
            #fraction = 0.0
            #all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
            all_OK[out] = True 


        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence( url, out )
            where = [site for site,info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out,"is in full at",",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
                wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
                at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
                else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
                print json.dumps( at_destination )
                print json.dumps( else_where, indent=2 )
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to = there )
                    for l in late_info:
                        l.update({"workflow":wfo.name,"dataset":out})
                    all_late_files.extend( late_info )
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

    
        ## verify if we have to do harvesting

        (OK, requests) = spawn_harvesting(url, wfi, in_full)
        all_OK.update( OK )

        ## only that status can let me go into announced
        if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']:
            print wfo.name,"to be announced"
            results=[]#'dummy']
            if not results:
                for out in outputs:
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"):
                            print "tier",tier,"neither TO or NO DDM for",out
                            results.append('Not recognitized tier %s'%tier)
                            sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            continue

                        n_copies = 2
                        destinations=[]
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier,indication in ddm_instructions.items():
                                    if ddmtier==tier or ddmtier in ['*','all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']
                                            
                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination="+",".join( destinations )
                        ## inject to DDM when necessary
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'%(n_copies, out,destination_spec))
                            print p.read()
                            status = p.close()
                            if status!=None:
                                print "Failed DDM, retrying a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'%(n_copies, out,destination_spec))
                                print p.read()
                                status = p.close()    
                                if status!=None:
                                    sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                            results.append( status )
                            if status == None:
                                wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( n_copies, out,destination_spec))
                                                            
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                if all(map(lambda result : result in ['None',None,True],results)):
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None',None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        wfo.wm_status = wl_bis.request['RequestStatus']
                        session.commit()
                        if wl_bis.request['RequestStatus'] in  ['announced','normal-archived']:
                            res = None
                        else:
                            ## retry ?
                            res = reqMgrClient.announceWorkflowCascade(url, wfo.name) 
                            
                    results.append( res )
                                
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor',"workflow is announced")
            else:
                print "ERROR with ",wfo.name,"to be announced",json.dumps( results )
                
        else:
            print wfo.name,"not good for announcing:",wfi.request['RequestStatus']
            wfi.sendLog('closor',"cannot be announced")
            held.add( wfo.name )

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries']>=retries_late]
    really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) )
        sendEmail('waiting for files to announce', subject)
        sendLog('closor',subject)
        print subject
        open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2))

    if held:
        sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))

Example #2

Show file

def completor(url, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    ## by workflow a list of fraction / timestamps
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())
    
    good_fractions = {}
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    overrides = getForceCompletes()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    
    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        if not any([c in wfo.name for c in good_fractions]): skip=True
        for user,spec in overrides.items():

            if wfi.request['RequestStatus']!='force-complete':
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):
                    sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name))
                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    if user == 'mcm' and use_mcm:
                        for pid in wfi.getPrepIDs():
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                    #sendEmail('completor test','not force completing automatically, you have to go back to it')
                    #skip=False
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue

        c = wfi.request['Campaign']
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.
        
        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request: 
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority

        monitor_delay = 7
        allowed_delay = 14
        if c in timeout:
            allowed_delay = timeout[c]
            
        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay }

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )

            #take the less optimistic
            percent_completions[output] = min( lumi_completion, event_completion )
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction,
                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in  percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        if delay <= allowed_delay: continue
        ## find ACDCs that might be running
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break
    
    if set_force_complete:
        sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
        #sendEmail('set force-complete', 'The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )

    #if wfs_no_location_in_GQ:
    #    sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text

Example #3

Show file

File: completor.py Project: bbockelm/WmAgentScripts

def completor(url, specific):

    CI = campaignInfo()

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all())
    ##wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order
    random.shuffle(wfs)

    ## by workflow a list of fraction / timestamps
    completions = json.loads(open("completions.json").read())

    good_fractions = {}
    for c in CI.campaigns:
        if "force-complete" in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]["force-complete"]

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    for wfo in wfs:
        if specific and not specific in wfo.name:
            continue

        if not any([c in wfo.name for c in good_fractions]):
            continue

        print "looking at", wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)

        if not "Campaign" in wfi.request:
            continue
        c = wfi.request["Campaign"]
        if not c in good_fractions:
            continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.0

        lumi_expected = None
        event_expected = None
        if not "TotalInputEvents" in wfi.request:
            if "RequestNumEvents" in wfi.request:
                event_expected = wfi.request["RequestNumEvents"]
            else:
                continue
        else:
            lumi_expected = wfi.request["TotalInputLumis"]
            event_expected = wfi.request["TotalInputEvents"]

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.0)

        running_log = filter(
            lambda change: change["Status"] in ["running-open", "running-closed"], wfi.request["RequestTransition"]
        )
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]["UpdateTime"] / (60.0 * 60.0 * 24.0)
        delay = now - then  ## in days

        (w, d) = divmod(delay, 7)
        print "\t" * int(w) + "Running since", delay, "[days]"
        if delay <= 4:
            continue
        if delay >= 7:
            sendEmail("long lasting workflow", "%s has been running for %s days" % (wfo.name, delay))

        percent_completions = {}
        for output in wfi.request["OutputDatasets"]:
            if not output in completions:
                completions[output] = {"injected": None, "checkpoints": [], "workflow": wfo.name}
            ## get completion fraction
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion = 0.0
            event_completion = 0.0
            if lumi_expected:
                lumi_completion = lumi_count / float(lumi_expected)
            if event_expected:
                event_completion = event_count / float(event_expected)

            # take the less optimistic
            percent_completions[output] = min(lumi_completion, event_completion)
            completions[output]["checkpoints"].append((now, event_completion))

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            print "all is above", good_fraction, "for", wfo.name
            print json.dumps(percent_completions, indent=2)
        else:
            print "\t", percent_completions.values(), "not over bound", good_fraction
            # print json.dumps( percent_completions, indent=2 )
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in percent_completions:
            completions[output]["injected"] = then

        # further check on delays
        cpuh = wfi.getComputingTime(unit="d")

        ran_at = wfi.request["SiteWhitelist"]
        print "Required:", cpuh,
        print "Time spend:", delay

        ## find ACDCs that might be running
        familly = getWorkflowById(url, wfi.request["PrepID"], details=True)
        for member in familly:
            ### if member['RequestName'] == wl['RequestName']: continue ## set himself out
            if member["RequestDate"] < wfi.request["RequestDate"]:
                continue
            if member["RequestStatus"] in ["None", None]:
                continue
            ## then set force complete all members
            if member["RequestStatus"] in ["running-opened", "running-closed"]:
                print "setting", member["RequestName"], "force-complete"
                print "NOT REALLY FORCING"
                sendEmail(
                    "force completing",
                    "TAGGING %s is worth force completing\n%s" % (member["RequestName"], percent_completions),
                )
                ##reqMgrClient.setWorkflowForceComplete(url, member['RequestName'])

        ## do it once only for testing
        # break

    open("completions.json", "w").write(json.dumps(completions, indent=2))

Example #4

Show file

File: closor.py Project: julianbadillo/WmAgentScripts

def closor(url, specific=None):
    if not componentInfo().check(): return

    CI = campaignInfo()
    LI = lockInfo()

    ## manually closed-out workflows should get to close with checkor
    for wfo in session.query(Workflow).filter(Workflow.status=='close').all():

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis 
        wl = getWorkLoad(url, wfo.name)
        wfo.wm_status = wl['RequestStatus']

        if wl['RequestStatus'] in  ['announced','normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wl['RequestStatus']
            print wfo.name,"is announced already",wfo.wm_status

        session.commit()


        expected_lumis = 1
        if not 'TotalInputLumis' in wl:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wl['TotalInputLumis']

        ## what are the outputs
        outputs = wl['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = []
        #print outputs
        if len(outputs): 
            print wfo.name,wl['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()

            print "\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            fraction = wfo.fraction_for_closing
            fraction = 0.0
            all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))


        ## only that status can let me go into announced
        if wl['RequestStatus'] in ['closed-out']:
            print wfo.name,"to be announced"

            results=[]#'dummy']
            if not results:
                for (io,out) in enumerate(outputs):
                    if all_OK[io]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        to_DDM = (wl['RequestType'] == 'ReDigi' and not ('DQM' in tier))
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wl and wl['Campaign']:
                                campaign = wl['Campaign']
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                            
                        ## inject to DDM when necessary
                        passed_to_DDM=True
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                            if status!=0:
                                print "Failed DDM, retrying a second time"
                                status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                                if status!=0:
                                    results.append("Failed DDM for %s"% out)
                                    sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    passed_to_DDM=False
                            if passed_to_DDM:
                                ## make a lock release
                                LI.release_everywhere( out, reason = 'global unlock after passing to DDM')                                
                                pass

                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                if all(map(lambda result : result in ['None',None,True],results)):
                    ## only announce if all previous are fine
                    results.append(reqMgrClient.announceWorkflowCascade(url, wfo.name))
                                
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                wfo.status = 'done'
                session.commit()
                print wfo.name,"is announced"
            else:
                print "ERROR with ",wfo.name,"to be announced",json.dumps( results )
        else:
            print wfo.name,"not good for announcing:",wl['RequestStatus']

Example #5

Show file

def checkor(url, spec=None, options=None):
    fDB = falseDB()

    wfs=[]
    if options.fetch:
        #workflows = getWorkflows(url, status='completed')
        #for wf in workflows:
        #    wfo = session.query(Workflow).filter(Workflow.name == wf ).first()
        #    if wfo:
        #        if not wfo.status in ['away','assistance']: continue
        #        wfs.append(wfo )
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )
    else:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue

        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            print wfo.name,"is already",wfo.wm_status
            wfo.status = 'close'
            session.commit()
            continue
        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived']:
            ## went into trouble
            wfo.status = 'trouble'
            print wfo.name,"is in trouble",wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            print wfo.name,"not running yet"
            session.commit()
            continue
        
        
        if wfo.wm_status != 'completed':
            ## for sure move on with closeout check if in completed
            print "no need to check on",wfo.name,"in status",wfo.wm_status
            session.commit()
            continue

        session.commit()        
        sub_assistance="" # if that string is filled, there will be need for manual assistance

        is_closing = True
        ## do the closed-out checks one by one

        # tuck out DQMIO/DQM
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        for member in familly:
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestStatus'] in ['running-opened','running-closed','assignment-approved','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                #print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False

        ## completion check
        percent_completions = {}
        event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        fractions_pass = {}
        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                print "overriding fraction to",fractions_pass[output],"for",output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

        if not all([percent_completions[out] > fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            sub_assistance+='-recovery'
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 300.
            campaign = get_campaign(output, wfi)
            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign
            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
            lumi_upper_limit[output] = upper_limit
        
        if any([ events_per_lumi[out] > lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance+='-biglumi'
            is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        vetoed_custodial_tier = ['MINIAODSIM']
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"
                        break
            ## get from the parent
            if not custodial and 'InputDataset' in wfi.request:
                parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort"
                    continue

            if not custodial:
                ## pick one at random
                custodial = SI.pick_SE()

            if custodial and not sub_assistance and not acdc:
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        custodials[custodial].append( output )
            else:
                print "cannot find a custodial for",wfo.name
            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance+="-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            for output in wfi.request['OutputDatasets']:
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output )
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output )
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                sub_assistance+='-duplicates'
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = len(acdc)

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                # set it from away/assistance* to close
                wfo.status = 'close'
                session.commit()
        else:
            print wfo.name,"needs assistance"
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            wfo.status = 'assistance'+sub_assistance
            if not options.test:
                print "setting",wfo.name,"to",wfo.status
                session.commit()

    fDB.summary()
    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low')
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations

Example #6

Show file

File: completor.py Project: AndrewLevin/WmAgentScripts

def completor(url, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    ## by workflow a list of fraction / timestamps
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())
    
    good_fractions = {}
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    overrides = getForceCompletes()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    
    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()


    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        if not any([c in wfo.name for c in good_fractions]): skip=True
        for user,spec in overrides.items():

            if wfi.request['RequestStatus']!='force-complete':
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):
                    sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name))
                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue

        c = wfi.request['Campaign']
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.
        
        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request: 
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority

        monitor_delay = 7
        allowed_delay = 14
        if c in timeout:
            allowed_delay = timeout[c]
            
        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay }

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )

            #take the less optimistic
            percent_completions[output] = min( lumi_completion, event_completion )
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction,
                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in  percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        if delay <= allowed_delay: continue
        ## find ACDCs that might be running
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break
    
    if set_force_complete:
        sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
        #sendEmail('set force-complete', 'The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )

    #if wfs_no_location_in_GQ:
    #    sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text

Example #7

Show file

def completor(url, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    all_stuck = set()
    ## take into account what stagor was saying
    all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ))
    ## take into account the block that needed to be repositioned recently
    all_stuck.update( [b.split('#')[0] for b in json.loads( open('%s/missing_blocks.json'%monitor_dir).read()) ] )
    ## take into account all stuck block and dataset from transfer team
    all_stuck.update( getAllStuckDataset()) 


    ## by workflow a list of fraction / timestamps
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())
    
    good_fractions = {}
    truncate_fractions = {} 
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    overrides = getForceCompletes()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print "can truncate complete on"
    print json.dumps( truncate_fractions ,indent=2)
    print "can overide on"
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    
    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()


    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        if not any([c in wfo.name for c in good_fractions]) and not any([c in wfo.name for c in truncate_fractions]): skip=True
        for user,spec in overrides.items():

            if wfi.request['RequestStatus']!='force-complete':
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue


        if wfi.request['RequestStatus'] in ['running-open','running-closed'] and (random.random() < 0.01):
            try:
                parse_one(url, wfo.name )
            except Exception as e:
                print "failed error parsing"
                print str(e)

        c = wfi.request['Campaign']
        #if not c in good_fractions: continue

        good_fraction = good_fractions.get(c,1000.)
        truncate_fraction = truncate_fractions.get(c,1000.)

        print "force at",good_fraction,"truncate at",truncate_fraction
        ignore_fraction = 2.
        
        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request: 
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        ## this is supposed to be the very initial request date, inherited from clones
        try:
            injected_on =time.mktime(time.strptime("-".join(map(lambda n : "%02d"%n, wfi.request['RequestDate'])), "%Y-%m-%d-%H-%M-%S")) / (60.*60.*24.)
            injection_delay = now - injected_on
        except:
            injection_delay = None

        
        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority

        if injection_delay!=None and injection_delay > 50 and False:
            tail_cutting_priority = 200000
            ## bump up to 200k
            print "Injected since",injection_delay,"[days] priority=",priority
            if wfi.request['RequestPriority'] != tail_cutting_priority:
                wfi.sendLog('completor','bumping priority to %d for being injected since'% tail_cutting_priority)
                reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority)

        _,prim,_,_ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = 14
        if c in timeout:
            allowed_delay = timeout[c]
            
        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay,
                                   "injection_delay" : injection_delay }

        percent_completions = {}
        
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )

            #take the less optimistic
            ## taking the max or not : to force-complete on over-pro
            percent_completions[output] = min( lumi_completion, event_completion )
            
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        elif is_stuck and (all([percent_completions[out] >= truncate_fraction for out in percent_completions])):
            wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( truncate_fraction,
                                                                                                      json.dumps( percent_completions, indent=2 ) ) )

            sendEmail('workflow for truncation','check on %s'%wfo.name)
            ##continue ## because you do not want to get this one right now
        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %.3f (complete) %.3f truncate \n%s"%(percent_completions.values(), 
                                                                                             good_fraction, truncate_fraction,
                                                                                             json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in  percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        if delay <= allowed_delay: continue
        ## find ACDCs that might be running
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break
    

        if delay >= 40:
            print wfo.name,"has been running for",dealy,"days"
            ## bumping up the priority?
            


    if set_force_complete:
        sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )


    print text

Example #8

Show file

File: checkor.py Project: bbockelm/WmAgentScripts

def checkor(url, spec=None, options=None):
    fDB = closeoutInfo()
    if userLock():
        return
    if duplicateLock():
        return

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=["mcm"])
    if not up.check():
        return
    use_mcm = up.status["mcm"]

    wfs = []
    if options.fetch:
        ## get all in running and check
        wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all())
        wfs.extend(session.query(Workflow).filter(Workflow.status == "assistance").all())
    if options.nofetch:
        ## than get all in need for assistance
        wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all())

    custodials = defaultdict(list)  # sites : dataset list
    transfers = defaultdict(list)  # sites : dataset list
    invalidations = []  # a list of files
    SI = global_SI
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split("/")[2].split("-")[0]
        except:
            if "Campaign" in wfi.request:
                campaign = wfi.request["Campaign"]
        return campaign

    by_passes = []
    holdings = []
    for bypassor, email in [
        ("jbadillo", "*****@*****.**"),
        ("vlimant", "*****@*****.**"),
        ("jen_a", "*****@*****.**"),
    ]:
        bypass_file = "/afs/cern.ch/user/%s/%s/public/ops/bypass.json" % (bypassor[0], bypassor)
        if not os.path.isfile(bypass_file):
            print "no file", bypass_file
            continue
        try:
            by_passes.extend(json.loads(open(bypass_file).read()))
        except:
            print "cannot get by-passes from", bypass_file, "for", bypassor
            sendEmail("malformated by-pass information", "%s is not json readable" % (bypass_file), destination=[email])

        holding_file = "/afs/cern.ch/user/%s/%s/public/ops/onhold.json" % (bypassor[0], bypassor)
        if not os.path.isfile(holding_file):
            print "no file", holding_file
            continue
        try:
            holdings.extend(json.loads(open(holding_file).read()))
        except:
            print "cannot get holdings from", holding_file, "for", bypassor
            sendEmail(
                "malformated by-pass information", "%s is not json readable" % (holding_file), destination=[email]
            )

    total_running_time = 5.0 * 60.0
    sleep_time = max(0.5, total_running_time / len(wfs))

    for wfo in wfs:
        if spec and not (spec in wfo.name):
            continue
        time.sleep(sleep_time)
        print "checking on", wfo.name

        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request["RequestStatus"]
        if wfo.wm_status == "closed-out":
            ## manually closed-out
            print wfo.name, "is already", wfo.wm_status
            wfo.status = "close"
            session.commit()
            continue

        elif wfo.wm_status in [
            "failed",
            "aborted",
            "aborted-archived",
            "rejected",
            "rejected-archived",
            "aborted-completed",
        ]:
            ## went into trouble
            wfo.status = "trouble"
            print wfo.name, "is in trouble", wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ["assigned", "acquired"]:
            ## not worth checking yet
            print wfo.name, "not running yet"
            session.commit()
            continue

        if "-onhold" in wfo.status:
            if wfo.name in holdings and wfo.name not in by_passes:
                print wfo.name, "on hold"
                continue

        if wfo.name in holdings and wfo.name not in by_passes:
            wfo.status = "assistance-onhold"
            print "setting", wfo.name, "on hold"
            session.commit()
            continue

        if wfo.wm_status != "completed" and not wfo.name in by_passes:
            ## for sure move on with closeout check if in completed
            print "no need to check on", wfo.name, "in status", wfo.wm_status
            session.commit()
            continue

        session.commit()
        sub_assistance = ""  # if that string is filled, there will be need for manual assistance

        is_closing = True

        ## get it from somewhere
        by_pass_checks = False
        if wfo.name in by_passes:
            print "we can bypass checks on", wfo.name
            by_pass_checks = True
        for bypass in by_passes:
            if bypass in wfo.name:
                print "we can bypass", wfo.name, "because of keyword", bypass
                by_pass_checks = True
                break

        if not CI.go(wfi.request["Campaign"]) and not by_pass_checks:
            print "No go for", wfo.name
            continue

        # tuck out DQMIO/DQM
        wfi.request["OutputDatasets"] = [out for out in wfi.request["OutputDatasets"] if not "/DQM" in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request["PrepID"], details=True)
        acdc = []
        acdc_inactive = []
        has_recovery_going = False
        had_any_recovery = False
        for member in familly:
            if member["RequestType"] != "Resubmission":
                continue
            if member["RequestName"] == wfo.name:
                continue
            if member["RequestDate"] < wfi.request["RequestDate"]:
                continue
            if member["RequestStatus"] in [
                "running-open",
                "running-closed",
                "assignment-approved",
                "assigned",
                "acquired",
            ]:
                print wfo.name, "still has an ACDC running", member["RequestName"]
                acdc.append(member["RequestName"])
                # print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False
                has_recovery_going = True
            elif member["RequestStatus"] == None:
                print member["RequestName"], "is not real"
                pass
            else:
                acdc_inactive.append(member["RequestName"])
                had_any_recovery = True
        ## completion check
        percent_completions = {}
        #        print "let's see who is crashing", wfo.name
        #        print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        if not "TotalInputEvents" in wfi.request:
            event_expected, lumi_expected = 0, 0
            if not "recovery" in wfo.status:
                sendEmail(
                    "missing member of the request",
                    "TotalInputEvents is missing from the workload of %s" % wfo.name,
                    destination=["*****@*****.**"],
                )
        else:
            event_expected, lumi_expected = wfi.request["TotalInputEvents"], wfi.request["TotalInputLumis"]

        if "RequestNumEvents" in wfi.request:
            event_expected = int(wfi.request["RequestNumEvents"])
        elif "Task1" in wfi.request and "RequestNumEvents" in wfi.request["Task1"]:
            event_expected = int(wfi.request["Task1"]["RequestNumEvents"])

        fractions_pass = {}
        for output in wfi.request["OutputDatasets"]:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.0
            if lumi_expected:
                percent_completions[output] = lumi_count / float(lumi_expected)
            if event_expected:
                percent_completions[output] = max(percent_completions[output], event_count / float(event_expected))

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and "fractionpass" in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]["fractionpass"]
                print "overriding fraction to", fractions_pass[output], "for", output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to", fractions_pass[output], "by command line for", output

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name, "is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if has_recovery_going:
                sub_assistance += "-recovering"
            elif had_any_recovery:
                ## we want to have this looked at
                sub_assistance += "-manual"
            else:
                sub_assistance += "-recovery"
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request["OutputDatasets"]:
            events_per_lumi[output] = getDatasetEventsPerLumi(output)

        lumi_upper_limit = {}
        for output in wfi.request["OutputDatasets"]:
            upper_limit = 301.0
            campaign = get_campaign(output, wfi)
            # if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and "lumisize" in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]["lumisize"]
                print "overriding the upper lumi size to", upper_limit, "for", campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to", upper_limit, "by command line"

            lumi_upper_limit[output] = upper_limit

        if any([events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name, "has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance += "-biglumi"
            is_closing = False

        any_presence = {}
        for output in wfi.request["OutputDatasets"]:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request["OutputDatasets"]:
            custodial_presences[output] = [s for s in any_presence[output] if "MSS" in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence = {}
        for output in wfi.request["OutputDatasets"]:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output)

        vetoed_custodial_tier = UC.get("tiers_with_no_custodial")
        out_worth_checking = [
            out for out in custodial_locations.keys() if out.split("/")[-1] not in vetoed_custodial_tier
        ]
        size_worth_checking = sum(
            [getDatasetSize(out) / 1023.0 for out in out_worth_checking]
        )  ## size in TBs of all outputs
        if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name, "has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]):
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:", custodial, "because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and "custodial" in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]["custodial"]
                        print "Setting custodial to", custodial, "from campaign configuration"
                        break
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the unified configuration custodial:", custodial, "because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            if not custodial and "InputDataset" in wfi.request:
                ## this is terribly dangerous to assume only
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite(wfi.request["InputDataset"])
                ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset", wfi.request[
                        "InputDataset"
                    ], "does not have custodial in the first place. abort"
                    sendEmail(
                        "dataset has no custodial location",
                        "Please take a look at %s in the logs of checkor" % wfi.request["InputDataset"],
                    )
                    is_closing = False
                    pick_custodial = False

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:", custodial, "because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for", wfo.name
                sendEmail(
                    "cannot find a custodial",
                    "cannot find a custodial for %s probably because of the total output size %d"
                    % (wfo.name, size_worth_checking),
                )

            if custodial and ((not sub_assistance and not acdc) or by_pass_checks):
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output] >= 1:
                            custodials[custodial].append(output)
                        else:
                            print "no file in phedex for", output, " not good to add to custodial requests"

            is_closing = False

        ## disk copy
        disk_copies = {}
        for output in wfi.request["OutputDatasets"]:
            disk_copies[output] = [s for s in any_presence[output] if (not "MSS" in s) and (not "Buffer" in s)]

        if not all(map(lambda sites: len(sites) != 0, disk_copies.values())):
            print wfo.name, "has not all output on disk"
            print json.dumps(disk_copies, indent=2)

        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request["OutputDatasets"]:
            dbs_presence[output] = dbs3Client.getFileCountDataset(output)
            dbs_invalid[output] = dbs3Client.getFileCountDataset(output, onlyInvalid=True)

        fraction_invalid = 0.01
        if (
            not all(
                [
                    dbs_presence[out] == (dbs_invalid[out] + phedex_presence[out])
                    for out in wfi.request["OutputDatasets"]
                ]
            )
            and not options.ignorefiles
        ):
            print wfo.name, "has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if (
            not all(
                [
                    (dbs_invalid[out] <= int(fraction_invalid * dbs_presence[out]))
                    for out in wfi.request["OutputDatasets"]
                ]
            )
            and not options.ignorefiles
        ):
            print wfo.name, "has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance += "-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            print "starting duplicate checker for", wfo.name
            for output in wfi.request["OutputDatasets"]:
                print "\tchecking", output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi(output)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi(output)
                    except:
                        print "was not possible to get the duplicate count for", output
                        is_closing = False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name, "has duplicates"
                print json.dumps(duplications, indent=2)
                ## hook for making file invalidation ?
                sub_assistance += "-duplicates"
                is_closing = False

        ## for visualization later on
        if not wfo.name in fDB.record:
            # print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {"datasets": {}, "name": wfo.name, "closeOutWorkflow": None}
        fDB.record[wfo.name]["closeOutWorkflow"] = is_closing
        for output in wfi.request["OutputDatasets"]:
            if not output in fDB.record[wfo.name]["datasets"]:
                fDB.record[wfo.name]["datasets"][output] = {}
            rec = fDB.record[wfo.name]["datasets"][output]
            rec["percentage"] = float("%.2f" % (percent_completions[output] * 100))
            rec["duplicate"] = duplications[output] if output in duplications else "N/A"
            rec["phedexReqs"] = (
                float("%.2f" % any_presence[output][custodial_presences[output][0]][1])
                if len(custodial_presences[output]) != 0
                else "N/A"
            )
            rec["closeOutDataset"] = is_closing
            rec["transPerc"] = (
                float("%.2f" % any_presence[output][disk_copies[output][0]][1])
                if len(disk_copies[output]) != 0
                else "N/A"
            )
            rec["correctLumis"] = (
                int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            )
            rec["missingSubs"] = (
                False if len(custodial_locations[output]) == 0 else ",".join(list(set(custodial_locations[output])))
            )
            rec["dbsFiles"] = dbs_presence[output]
            rec["dbsInvFiles"] = dbs_invalid[output]
            rec["phedexFiles"] = phedex_presence[output]
            rec["acdc"] = "%d / %d" % (len(acdc), len(acdc + acdc_inactive))

        if by_pass_checks:
            ## force closing
            is_closing = True

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting", wfo.name, "closed-out"
            if not options.test:
                if wfo.wm_status in ["closed-out", "announced", "normal-archived"]:
                    print wfo.name, "is already", wfo.wm_status, "not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer", res
                if not res in ["None", None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)

                if res in [None, "None"]:
                    wfo.status = "close"
                    session.commit()
                else:
                    print "could not close out", wfo.name, "will try again next time"
        else:
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            new_status = "assistance" + sub_assistance
            print wfo.name, "needs assistance with", new_status

            if sub_assistance and wfo.status != new_status and "PrepID" in wfi.request and not "manual" in wfo.status:
                pid = wfi.getPrepIDs()[0].replace("task_", "")
                # pid = wfi.request['PrepID'].replace('task_','')
                ## notify
                messages = {
                    "recovery": "Samples completed with missing statistics:\n%s "
                    % (
                        "\n".join(
                            [
                                "%.2f %% complete for %s" % (percent_completions[output] * 100, output)
                                for output in wfi.request["OutputDatasets"]
                            ]
                        )
                    ),
                    "biglumi": "Samples completed with large luminosity blocks:\n%s "
                    % (
                        "\n".join(
                            [
                                "%d > %d for %s" % (events_per_lumi[output], lumi_upper_limit[output], output)
                                for output in wfi.request["OutputDatasets"]
                                if (events_per_lumi[output] > lumi_upper_limit[output])
                            ]
                        )
                    ),
                    "duplicate": "Samples completed with duplicated luminosity blocks:\n%s"
                    % (
                        "\n".join(
                            [
                                "%s" % output
                                for output in wfi.request["OutputDatasets"]
                                if output in duplications and duplications[output]
                            ]
                        )
                    ),
                }
                text = "The request %s (%s) is facing issue in production.\n" % (pid, wfo.name)
                content = ""
                for case in messages:
                    if case in new_status:
                        content += "\n" + messages[case] + "\n"
                text += content
                text += "You are invited to check, while this is being taken care of by Ops.\n"
                text += "This is an automated message."
                if use_mcm and content:
                    print "Sending notification back to requestor"
                    print text
                    batches = mcm.getA("batches", query="contains=%s&status=announced" % pid)
                    if len(batches):
                        ## go notify the batch
                        bid = batches[-1]["prepid"]
                        print "batch nofication to", bid
                        mcm.put("/restapi/batches/notify", {"notes": text, "prepid": bid})

                    ## go notify the request
                    print "request notification to", pid
                    mcm.put("/restapi/requests/notify", {"message": text, "prepids": [pid]})

            ## case where the workflow was in manual from recoveror
            if not "manual" in wfo.status or new_status != "assistance-recovery":
                wfo.status = new_status
                if not options.test:
                    print "setting", wfo.name, "to", wfo.status
                    session.commit()
            else:
                print "current status is", wfo.status, "not changing to anything"

    fDB.html()

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ",".join(custodials[site]), "=>", site
        if not options.test:
            result = makeReplicaRequest(
                url,
                site,
                list(set(custodials[site])),
                "custodial copy at production close-out",
                custodial="y",
                priority="low",
                approve=(site in SI.sites_auto_approve),
            )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ",".join(transfers[site]), "=>", site
        if not options.test:
            result = None
            # result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations

Example #9

Show file

#!/usr/bin/env python
from utils import getDatasetPresence, getDatasetEventsAndLumis
import json
import sys

url = 'cmsweb.cern.ch'

ev, lumi = getDatasetEventsAndLumis(sys.argv[1])
if lumi:
    print ev, lumi, ev / float(lumi)
presence = getDatasetPresence(url, sys.argv[1])
print json.dumps(presence, indent=2)

Example #10

Show file

File: closor.py Project: lucacopa/WmAgentScripts

def closor(url, specific=None):
    CI = campaignInfo()

    ## manually closed-out workflows should get to close with checkor
    for wfo in session.query(Workflow).filter(Workflow.status=='close').all():

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis 
        wl = getWorkLoad(url, wfo.name)
        wfo.wm_status = wl['RequestStatus']

        if wl['RequestStatus'] in  ['announced','normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wl['RequestStatus']
            print wfo.name,"is done already",wfo.wm_status

        session.commit()

        if not 'TotalInputLumis' in wl:
            print wfo.name,"has not been assigned yet"
            continue

        expected_lumis = wl['TotalInputLumis']

        ## what are the outputs
        outputs = wl['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = []
        #print outputs
        if len(outputs): 
            print wfo.name,wl['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()

            print "\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            fraction = wfo.fraction_for_closing
            fraction = 0.0
            all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))


        ## only that status can let me go into announced
        if wl['RequestStatus'] in ['closed-out']:
            print wfo.name,"to be announced"

            results=[]#'dummy']
            if not results:
                results.append(reqMgrClient.announceWorkflowCascade(url, wfo.name))
                for (io,out) in enumerate(outputs):
                    if all_OK[io]:
                        results.append(setDatasetStatusDBS3.setStatusDBS3('https://cmsweb.cern.ch/dbs/prod/global/DBSWriter', out, 'VALID' ,''))
                        tier = out.split('/')[-1]
                        to_DDM = (wl['RequestType'] == 'ReDigi' and not ('DQM' in tier))
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wl and wl['Campaign']:
                                campaign = wl['Campaign']
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                            
                        ## inject to DDM everything from ReDigi
                        if to_DDM:
                            print "Sending",out," to DDM"
                            subprocess.call(['python','assignDatasetToSite.py','--dataset='+out,'--exec'])
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append(None)
            
            #print results
            if all(map(lambda result : result in ['None',None],results)):
                wfo.status = 'done'
                session.commit()
                print wfo.name,"is announced"
            else:
                print "ERROR with ",wfo.name,"to be announced",json.dumps( results )
        else:
            print wfo.name,"not good for announcing:",wl['RequestStatus']

Example #11

Show file

def closor(url, specific=None, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    wfs_n = [w.name for w in wfs]
    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)

    held = set()

    print len(wfs), "closing"
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if options.limit: max_per_round = options.limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]

        def rank(wfn):
            return all_closedout.index(wfn) if wfn in all_closedout else 0

        wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    for iwfo, wfo in enumerate(wfs):

        if specific and not specific in wfo.name: continue

        print "Progress [%d/%d]" % (iwfo, len(wfs))
        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url,
                                                   batch_name,
                                                   details=True)
                batch_go[batch_name] = all(
                    map(
                        lambda s: not s in [
                            'completed', 'running-open', 'running-closed',
                            'acquired', 'assigned', 'assignment-approved'
                        ], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog(
                    'closor',
                    'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'
                    % (batch_name, batch_name))
                continue

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived'
                                            ] and not options.force:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog(
                'closor',
                '%s is announced already : %s' % (wfo.name, wfo.wm_status))
        session.commit()

        if jump_the_line:
            wfi.sendLog('closor', 'Announcing while completing')

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis'] == 0:
            print wfo.name, "is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(
                Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            fraction = lumi_count / float(expected_lumis) * 100.

            completion_line = "%60s %d/%d = %3.2f%%" % (
                out, lumi_count, expected_lumis, fraction)
            wfi.sendLog('closor', "\t%s" % completion_line)
            if wfi.isRelval() and fraction < batch_goodness:
                batch_warnings[wfi.getCampaign()].add(completion_line)
            stats[out] = lumi_count
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites'] + wfi.request[
                    'CustodialSites']
                wfi.sendLog(
                    'closor', "%s is not in full anywhere. send to %s" %
                    (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items()
                                       if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items()
                                   if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting

        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and (
            (wfi.request['RequestStatus'] in ['closed-out']) or options.force
                or jump_the_line):
            print wfo.name, "to be announced"
            results = []
            if not results:
                for out in outputs:
                    if out in stats and not stats[out]:
                        continue
                    _, dsn, process_string, tier = out.split('/')

                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')

                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if destinations:
                            wfi.sendLog(
                                'closor', '%s to go to %s' %
                                (out, ', '.join(sorted(destinations))))

                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(
                                url,
                                site, [out],
                                'Copy for release validation consumption',
                                priority='normal',
                                approve=True,
                                mail=False,
                                group='RelVal')
                            try:
                                request_id = result['phedex'][
                                    'request_created'][0]['id']
                                results.append(True)
                            except:
                                results.append('Failed relval transfer')

                    elif all_OK[out]:

                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request[
                                    'Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[
                                campaign] and tier in CI.campaigns[campaign][
                                    'toDDM']:
                            to_DDM = True

                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get(
                                "tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog(
                                'closor',
                                "could not recognize %s for injecting in DDM" %
                                out,
                                level='critical')
                            continue

                        n_copies = 1
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[
                                campaign]:
                            ddm_instructions = CI.campaigns[campaign][
                                'DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items(
                                ):
                                    if ddmtier == tier or ddmtier in [
                                            '*', 'all'
                                    ]:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(
                                destinations)
                        group_spec = ""  ## not used yet
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending", out, " to DDM"
                            status = pass_to_dynamo(
                                [out],
                                N=n_copies,
                                sites=destinations if destinations else None,
                                group=group_spec if group_spec else None)
                            results.append(status)
                            if status == True:
                                wfi.sendLog(
                                    'closor',
                                    '%s is send to dynamo in %s copies %s %s' %
                                    (out, n_copies, sorted(destinations),
                                     group_spec))
                            else:
                                sendLog('closor',
                                        "could not add " + out +
                                        " to dynamo pool. check closor logs.",
                                        level='critical')
                                wfi.sendLog(
                                    'closor', "could not add " + out +
                                    " to dynamo pool. check closor logs.")
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                if all(
                        map(lambda result: result in ['None', None, True],
                            results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(
                            url, wfo.name)
                        if not res in ['None', None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            wfo.wm_status = wl_bis.request['RequestStatus']
                            session.commit()
                            if wl_bis.request['RequestStatus'] in [
                                    'announced', 'normal-archived'
                            ]:
                                res = None
                            else:
                                ## retry ?
                                res = reqMgrClient.announceWorkflowCascade(
                                    url, wfo.name)

                        results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True],
                       results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        wfo.status = wfo.status.replace(
                            'announce', 'announced')
                else:
                    wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow outputs are announced")
            else:
                wfi.sendLog(
                    'closor', "Error with %s to be announced \n%s" %
                    (wfo.name, json.dumps(results)))

        elif wfi.request['RequestStatus'] in [
                'failed', 'aborted', 'aborted-archived', 'rejected',
                'rejected-archived', 'aborted-completed'
        ]:
            if wfi.isRelval():
                wfo.status = 'forget'
                wfo.wm_status = wfi.request['RequestStatus']
                wfi.sendLog(
                    'closor',
                    "%s is %s, but will not be set in trouble to find a replacement."
                    % (wfo.name, wfo.wm_status))
            else:
                wfo.status = 'trouble'
                wfo.wm_status = wfi.request['RequestStatus']
            session.commit()
        else:
            print wfo.name, "not good for announcing:", wfi.request[
                'RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" %
                ("\n".join(sorted(held))),
                level='critical')

    #batches = json.loads(open('batches.json').read())
    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            if batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s 
This is an automated message.
""" % (bname, bname, issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)

Example #12

Show file

File: closor.py Project: DAMason/WmAgentScripts

def closor(url, specific=None, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return


    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    jump_the_line = options.announce if options else False
    if jump_the_line:
        wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status=='close').all()

    held = set()

    print len(wfs),"closing"
    max_per_round = UC.get('max_per_round').get('closor',None)
    if options.limit: max_per_round = options.limit
    random.shuffle( wfs )    
    if max_per_round: wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    for wfo in wfs:

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis 
        wfi = workflowInfo(url, wfo.name )
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url , batch_name, details=True)
                batch_go[ batch_name ]  = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog('closor', 'Cannot close for now because the batch %s is not all close'% batch_name)
                continue


        if wfi.request['RequestStatus'] in  ['announced','normal-archived'] and not options.force:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status))
        session.commit()

        if jump_the_line:
            wfi.sendLog('closor','Announcing while completing')

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis']==0:
            print wfo.name,"is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda : False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs): 
            print wfo.name,wfi.request['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            fraction = lumi_count/float(expected_lumis)*100.

            completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction)
            wfi.sendLog('closor',"\t%s"% completion_line)
            if wfi.isRelval() and fraction < batch_goodness:
                batch_warnings[ wfi.getCampaign()].add( completion_line )
            stats[out] = lumi_count
            all_OK[out] = True 


        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence( url, out )
            where = [site for site,info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out,"is in full at",",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
                wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
                at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
                else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
                print json.dumps( at_destination )
                print json.dumps( else_where, indent=2 )
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to = there )
                    for l in late_info:
                        l.update({"workflow":wfo.name,"dataset":out})
                    all_late_files.extend( late_info )
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

    
        ## verify if we have to do harvesting

        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update( OK )

        ## only that status can let me go into announced
        if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line):
            print wfo.name,"to be announced"
            results=[]
            if not results:
                for out in outputs:
                    if out in stats and not stats[out]: 
                        continue
                    _,dsn,process_string,tier = out.split('/')

                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')

                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        
                        if destinations:
                            wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations ))))

                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal')
                            try:
                                request_id =  result['phedex']['request_created'][0]['id']
                                results.append( True )
                            except:
                                results.append( 'Failed relval transfer' )
                        
                    elif all_OK[out]:

                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True

                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"):
                            print "tier",tier,"neither TO or NO DDM for",out
                            results.append('Not recognitized tier %s'%tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical')
                            continue

                        n_copies = 2
                        destinations=[]
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier,indication in ddm_instructions.items():
                                    if ddmtier==tier or ddmtier in ['*','all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']
                                            
                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination="+",".join( destinations )
                        group_spec = "" ## not used yet 
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending",out," to DDM"
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 0 --exec'%(n_copies, out,destination_spec, group_spec))
                            ddm_text = p.read()
                            print ddm_text
                            status = p.close()
                            if status!=None:
                                print "Failed DDM, retrying to send",out,"a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 1 --exec'%(n_copies, out,destination_spec, group_spec))

                                ddm_text = p.read()
                                print ddm_text
                                status = p.close()    
                                if status!=None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical')
                                    if options.force: status = True
                            results.append( status )
                            if status == None:
                                wfi.sendLog('closor',ddm_text)
                                wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec))
                                                            
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                if all(map(lambda result : result in ['None',None,True],results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                        if not res in ['None',None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            wfo.wm_status = wl_bis.request['RequestStatus']
                            session.commit()
                            if wl_bis.request['RequestStatus'] in  ['announced','normal-archived']:
                                res = None
                            else:
                                ## retry ?
                                res = reqMgrClient.announceWorkflowCascade(url, wfo.name) 
                            
                        results.append( res )
                                
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        wfo.status = wfo.status.replace('announce','announced')
                else:
                    wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor',"workflow outputs are announced")
            else:
                wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results )))
            
        elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            if wfi.isRelval():
                wfo.status = 'forget'
                wfo.wm_status = wfi.request['RequestStatus']
                wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, wfo.wm_status))
            else:
                wfo.status = 'trouble'
                wfo.wm_status = wfi.request['RequestStatus']
            session.commit()
        else:
            print wfo.name,"not good for announcing:",wfi.request['RequestStatus']
            wfi.sendLog('closor',"cannot be announced")
            held.add( wfo.name )

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries']>=retries_late]
    really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) )
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor',subject)
        print subject
        open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2))

    if held:
        sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical')


    #batches = json.loads(open('batches.json').read())
    for bname,go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s"% bname
            issues=""
            if batch_warnings[ bname ]:
                issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
                issues+="\n".join( sorted( batch_warnings[ bname ] ))
                issues+="\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s 
This is an automated message.
"""%( bname, 
      bname,
      issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to )

Example #13

Show file

File: closor.py Project: CMSCompOps/WmAgentScripts

    def close(self):
        if os.path.isfile('.closor_stop'):
            print "The closing of workflows is shortened"
            return 

        url = self.url
        batch_go = self.batch_go
        CI = self.CI
        UC = self.UC
        wfo = self.wfo

        jump_the_line = self.jump_the_line
        batch_goodness = self.batch_goodness
        check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

        ## what is the expected #lumis 
        self.wfi = workflowInfo(url, wfo.name )
        wfi = self.wfi
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url , batch_name, details=True)
                batch_go[ batch_name ]  = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog('closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'%( batch_name, batch_name))
                return


        if wfi.request['RequestStatus'] in  ['announced','normal-archived'] and not options.force:
            ## manually announced ??
            self.to_status = 'done'
            self.to_wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,self.to_wm_status))
            return 

        if jump_the_line:
            wfi.sendLog('closor','Announcing while completing')

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis']==0:
            print wfo.name,"is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda : False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs): 
            print wfo.name,wfi.request['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            self.outs.append( Output( datasetname = out ))
            odb = self.outs[-1]
            odb.workflow = wfo
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())

            fraction = lumi_count/float(expected_lumis)*100.

            completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction)
            wfi.sendLog('closor',"\t%s"% completion_line)
            if wfi.isRelval() and fraction < batch_goodness:
                self.batch_warnings[ wfi.getCampaign()].add( completion_line )
            stats[out] = lumi_count
            all_OK[out] = True 


        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence( url, out )
            where = [site for site,info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out,"is in full at",",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
                wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
                at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
                else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
                print json.dumps( at_destination )
                print json.dumps( else_where, indent=2 )
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to = there )
                    for l in late_info:
                        l.update({"workflow":wfo.name,"dataset":out})
                    self.all_late_files.extend( late_info )
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

    
        ## verify if we have to do harvesting
        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update( OK )

        ## only that status can let me go into announced
        if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line):
            print wfo.name,"to be announced"
            results=[]
            if not results:
                for out in outputs:
                    print "dealing with",out
                    if out in stats and not stats[out]: 
                        continue
                    _,dsn,process_string,tier = out.split('/')

                    if all_OK[out]:
                        print "setting valid"
                        results.append(setDatasetStatus(out, 'VALID', withFiles=False))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')

                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        
                        if destinations:
                            wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations ))))

                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal')
                            try:
                                request_id =  result['phedex']['request_created'][0]['id']
                                results.append( True )
                            except:
                                results.append( 'Failed relval transfer' )
                        
                    elif all_OK[out]:

                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True

                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"):
                            print "tier",tier,"neither TO or NO DDM for",out
                            results.append('Not recognitized tier %s'%tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical')
                            continue

                        n_copies = 1
                        destinations=[]
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier,indication in ddm_instructions.items():
                                    if ddmtier==tier or ddmtier in ['*','all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']
                                            
                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination="+",".join( destinations )
                        group_spec = "" ## not used yet 
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending",out," to DDM"
                            status = pass_to_dynamo( [out], N = n_copies, sites=destinations if destinations else None, group = group_spec if group_spec else None)
                            results.append( status )
                            if status == True:
                                wfi.sendLog('closor','%s is send to dynamo in %s copies %s %s'%( out, n_copies, sorted(destinations), group_spec))
                            else:
                                sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.", level='critical')
                                wfi.sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.")
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                if all(map(lambda result : result in ['None',None,True],results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                        if not res in ['None',None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            self.to_wm_status = wl_bis.request['RequestStatus']
                            if wl_bis.request['RequestStatus'] in  ['announced','normal-archived']:
                                res = None
                            else:
                                res = reqMgrClient.announceWorkflowCascade(url, wfo.name) 
                            
                        results.append( res )
                                
            print results
            if all(map(lambda result : result in ['None',None,True],results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        self.to_status = wfo.status.replace('announce','announced')
                else:
                    self.to_status = 'done'
                    self.closing = True
                
                    
                wfi.sendLog('closor',"workflow outputs are announced")
            else:
                wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results )))
            
        elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            if wfi.isRelval():
                self.to_status = 'forget'
                self.to_wm_status = wfi.request['RequestStatus']
                wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, self.to_wm_status))
            else:
                self.to_status = 'trouble'
                self.to_wm_status = wfi.request['RequestStatus']
        else:
            print wfo.name,"not good for announcing:",wfi.request['RequestStatus']
            wfi.sendLog('closor',"cannot be announced")
            self.held.add( wfo.name )

Example #14

Show file

File: checkor.py Project: julianbadillo/WmAgentScripts

def checkor(url, spec=None, options=None):
    fDB = closeoutInfo()

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.fetch:
        ## get all in running and check
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )
    if options.nofetch:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    by_passes = []
    holdings = []
    for bypassor,email in [('jbadillo','*****@*****.**'),('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            print "no file",bypass_file
            continue
        try:
            by_passes.extend( json.loads(open(bypass_file).read()))
        except:
            print "cannot get by-passes from",bypass_file,"for",bypassor
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            print "no file",holding_file
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            print "cannot get holdings from",holding_file,"for",bypassor
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])


    total_running_time = 5.*60. 
    sleep_time = max(0.5, total_running_time / len(wfs))

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        print "checking on",wfo.name
        
        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            print wfo.name,"is already",wfo.wm_status
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            print wfo.name,"is in trouble",wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            print wfo.name,"not running yet"
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings:
                print wfo.name,"on hold"
                continue
        if wfo.name in holdings:
            wfo.status = 'assistance-onhold'
            print "setting",wfo.name,"on hold"
            session.commit()
            continue
        
        if wfo.wm_status != 'completed':
            ## for sure move on with closeout check if in completed
            print "no need to check on",wfo.name,"in status",wfo.wm_status
            session.commit()
            continue

        session.commit()        
        sub_assistance="" # if that string is filled, there will be need for manual assistance

        is_closing = True
        ## do the closed-out checks one by one

        ## get it from somewhere
        by_pass_checks = False
        if wfo.name in by_passes:
            print "we can bypass checks on",wfo.name
            by_pass_checks = True
        for bypass in by_passes:
            if bypass in wfo.name:
                print "we can bypass",wfo.name,"because of keyword",bypass
                by_pass_checks = True
                break

        # tuck out DQMIO/DQM
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        has_recovery_going=False
        had_any_recovery = False
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestStatus'] in ['running-open','running-closed','assignment-approved','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                #print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False
                has_recovery_going=True
            elif member['RequestStatus']==None:
                print member['RequestName'],"is not real"
                pass
            else:
                acdc_inactive.append( member['RequestName'] )
                had_any_recovery = True
        ## completion check
        percent_completions = {}
#        print "let's see who is crashing", wfo.name
#        print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        fractions_pass = {}
        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                print "overriding fraction to",fractions_pass[output],"for",output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if has_recovery_going:
                sub_assistance+='-recovering'
            elif had_any_recovery:
                ## we want to have this looked at
                sub_assistance+='-manual'
            else:
                sub_assistance+='-recovery'
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = get_campaign(output, wfi)
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
        
        if any([ events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance+='-biglumi'
            is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        vetoed_custodial_tier = ['MINIAODSIM']
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"
                        break
            ## get from the parent
            pick_custodial = True
            if not custodial and 'InputDataset' in wfi.request:
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( wfi.request['InputDataset'])
                ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort"
                    sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%wfi.request['InputDataset'])
                    is_closing = False
                    pick_custodial = False

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE()

            if custodial and ((not sub_assistance and not acdc) or by_pass_checks):
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            else:
                print "cannot find a custodial for",wfo.name
            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance+="-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output )
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output )
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                sub_assistance+='-duplicates'
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))

        if by_pass_checks:
            ## force closing
            is_closing = True

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                print "close out answer",res
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            new_status = 'assistance'+sub_assistance
            print wfo.name,"needs assistance with",new_status
            
            if sub_assistance and wfo.status != new_status and 'PrepID' in wfi.request and not 'manual' in wfo.status:
                pid = wfi.getPrepIDs()[0].replace('task_','')
                #pid = wfi.request['PrepID'].replace('task_','')
                ## notify
                messages= {
                    'recovery' : 'Samples completed with missing lumi count:\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ) ),
                    'biglumi' : 'Samples completed with large luminosity blocks:\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])])),
                    'duplicate' : 'Samples completed with duplicated luminosity blocks:\n%s'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    }
                text ="The request %s (%s) is facing issue in production.\n" %( pid, wfo.name )
                content = ""
                for case in messages:
                    if case in new_status:
                        content+= "\n"+messages[case]+"\n"
                text += content
                text += "You are invited to check, while this is being taken care of by Ops.\n"
                text += "This is an automated message."
                if use_mcm and content:
                    print "Sending notification back to requestor"
                    print text
                    batches = mcm.getA('batches',query='contains=%s&status=announced'%pid)
                    if len(batches):
                        ## go notify the batch
                        bid = batches[-1]['prepid']
                        print "batch nofication to",bid
                        mcm.put('/restapi/batches/notify', { "notes" : text, "prepid" : bid})


                    ## go notify the request
                    print "request notification to",pid
                    mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid] })

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"


    fDB.html()

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations

Example #15

Show file

File: closor.py Project: menglu21/WmAgentScripts

def closor(url, specific=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')
    ## manually closed-out workflows should get to close with checkor
    if specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    held = set()

    max_per_round = UC.get('max_per_round').get('closor', None)
    random.shuffle(wfs)
    if max_per_round: wfs = wfs[:max_per_round]

    for wfo in wfs:

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog(
                'closor',
                '%s is announced already : %s' % (wfo.name, wfo.wm_status))
        session.commit()

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(
                Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()

            wfi.sendLog(
                'closor', "\t%60s %d/%d = %3.2f%%" %
                (out, lumi_count, expected_lumis,
                 lumi_count / float(expected_lumis) * 100.))
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            #fraction = wfo.fraction_for_closing
            #fraction = 0.0
            #all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites'] + wfi.request[
                    'CustodialSites']
                wfi.sendLog(
                    'closor', "%s is not in full anywhere. send to %s" %
                    (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items()
                                       if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items()
                                   if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting

        (OK, requests) = spawn_harvesting(url, wfi, in_full)
        all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and wfi.request['RequestStatus'] in [
                'closed-out'
        ]:
            print wfo.name, "to be announced"
            results = []  #'dummy']
            if not results:
                for out in outputs:
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request[
                                    'Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[
                                campaign] and tier in CI.campaigns[campaign][
                                    'toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get(
                                "tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog(
                                'closor',
                                "could not recognize %s for injecting in DDM" %
                                out,
                                level='critical')
                            continue

                        n_copies = 2
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[
                                campaign]:
                            ddm_instructions = CI.campaigns[campaign][
                                'DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items(
                                ):
                                    if ddmtier == tier or ddmtier in [
                                            '*', 'all'
                                    ]:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(
                                destinations)
                        ## inject to DDM when necessary
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            p = os.popen(
                                'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'
                                % (n_copies, out, destination_spec))
                            print p.read()
                            status = p.close()
                            if status != None:
                                print "Failed DDM, retrying a second time"
                                p = os.popen(
                                    'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'
                                    % (n_copies, out, destination_spec))
                                print p.read()
                                status = p.close()
                                if status != None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor',
                                            "could not add " + out +
                                            " to DDM pool. check closor logs.",
                                            level='critical')
                            results.append(status)
                            if status == None:
                                wfi.sendLog(
                                    'closor',
                                    '%s is send to AnalysisOps DDM pool in %s copies %s'
                                    % (n_copies, out, destination_spec))

                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                if all(
                        map(lambda result: result in ['None', None, True],
                            results)):
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None', None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        wfo.wm_status = wl_bis.request['RequestStatus']
                        session.commit()
                        if wl_bis.request['RequestStatus'] in [
                                'announced', 'normal-archived'
                        ]:
                            res = None
                        else:
                            ## retry ?
                            res = reqMgrClient.announceWorkflowCascade(
                                url, wfo.name)

                    results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True],
                       results)):
                wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow is announced")
            else:
                print "ERROR with ", wfo.name, "to be announced", json.dumps(
                    results)

        else:
            print wfo.name, "not good for announcing:", wfi.request[
                'RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))
        sendLog('closor',
                "the workflows below are held up \n%s" % ("\n".join(held)),
                level='critical')

Example #16

Show file

File: checkor.py Project: amaltaro/WmAgentScripts

def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = global_SI
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    holdings = []
    #try:
    #    already_notified = json.loads(open('already_notifified.json').read())
    #except:
    #    print "no record of already notified workflow. starting fresh"
    #    already_notified = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        bypasses.extend( mcm_force )

    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False
        pids = wfi.getPrepIDs()
        bypass_by_mcm = False
        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
            if bypass in pids:
                wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass))
                bypass_checks = True
                bypass_by_mcm = True
                break
        
        #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks:
        #    print "No go for",wfo.name
        #    wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign'])
        #    continue


        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
            elif member['RequestStatus']==None:
                print member['RequestName'],"is not real"
                pass
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = int(wfi.request['Task1']['RequestNumEvents'])

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )
            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        print "These %d files are missing in phedex"%(len(missing_phedex))
                        print "\n".join( missing_phedex )
                    if missing_dbs:
                        print "These %d files are missing in dbs"%(len(missing_dbs))
                        print "\n".join( missing_dbs )

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and bypass_by_mcm:
                        ## shoot large on all prepids
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations

Example #17

Show file

File: checkor.py Project: dabercro/WmAgentScripts

def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock() and not options.go:  return

    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    def time_point(label="",sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s"%(label, nows)
        print "Since start: %s [s]"% ( now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) 
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]"% ( now - time_point.lap ) 
            time_point.lap = now            
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())
    
    runnings = session.query(Workflow).filter(Workflow.status == 'away').all()
    standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()

    ## intersect with what is actually in completed status in request manager now
    all_completed = set(getWorkflows(url, 'completed' ))

    wfs=[]

    if options.strict:
        ## the one which were running and now have completed
        print "strict option is on: checking workflows that freshly completed"
        wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings))
    if options.update:
        print "update option is on: checking workflows that have not completed yet"
        wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings))

    if options.clear:
        print "clear option is on: checking workflows that are ready to toggle closed-out"
        wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings))
    if options.review:
        print "review option is on: checking the workflows that needed intervention"
        wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings))

    ## what is left out are the wf which were running and ended up aborted/failed/...

    

    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False) if use_mcm else None

    def get_campaign(output, wfi):
        ## this should be a perfect matching of output->task->campaign
        campaign = None
        era = None
        wf_campaign = None
        if 'Campaign' in wfi.request:   wf_campaign = wfi.request['Campaign']
        try:
            era = output.split('/')[2].split('-')[0]
        except:
            era = None
            
        if wfi.isRelval(): 
            campaign = wf_campaign
        else:
            campaign = era if era else wf_campaign
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    
    actors = UC.get('allowed_bypass')

    for bypassor,email in actors:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            extending = json.loads(open(holding_file).read())
            print bypassor,"is holding",extending
            holdings.extend( extending )
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in actors:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        #if forcings:
        #    sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    in_manual = 0

    ## now you have a record of what file was invalidated globally from TT
    TMDB_invalid = dataCache.get('file_invalidation') 
    #try:
    #    TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))])
    #    TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid)
    #    print len(TMDB_invalid),"globally invalidated files"
    #except Exception as e:
    #    print "TMDB not fetched"
    #    print str(e)
    #    TMDB_invalid = []


    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if options.limit: max_per_round=options.limit
    if max_per_round and not spec: wfs = wfs[:max_per_round]



    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        
        time.sleep( sleep_time )
        
        time_point("Starting with %s"% wfo.name)

        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM'))
        campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c 
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        true_familly = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['PrepID'] != wfi.request['PrepID'] : continue
            #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue

            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical')
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue

            true_familly.append( member['RequestName'] )
            #try:
            #    parse_one(url, member['RequestName'])
            #except:
            #    print "Could not make error report for",member['RequestName']

            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))
            sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical')

        time_point("checked workflow familly", sub_lap=True)


        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong ibsolute
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        events_per_lumi = {}

        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        time_point("execpted statistics", sub_lap=True)

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100
                
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            default_pass = UC.get('default_fraction_pass')
            fractions_pass[output] = default_pass
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                if type(CI.campaigns[c]['fractionpass']) == dict:
                    tier = output.split('/')[-1]
                    priority = str(wfi.request['RequestPriority'])
                    ## defined per tier
                    fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass)
                    if tier in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier]
                    if priority in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority]
                else:
                    fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical')
                #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**'])
                ## do not bypass for now, until Alan understands why we are loosing ACDC docs 
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        time_point("checked output size", sub_lap=True)

        ## correct lumi < 300 event per lumi
        #for output in wfi.request['OutputDatasets']:
        #events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        time_point("checked dataset presence", sub_lap=True)

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        time_point("checked custodiality", sub_lap=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        time_point("checked phedex count", sub_lap=True)


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            group = None
            if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]:
                group = CI.campaigns[campaign]['phedex_group']
                print "using group",group,"for replica"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit")
                
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)


            if custodial and size_worht_going_to_ddm > tape_size_limit:
                print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit))
                custodial = None

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"

                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output))
                            custodials[custodial].append( output )
                            if group: custodials[custodial][-1]+='@%s'%group
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        time_point("determined tape location", sub_lap=True)

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        
        time_point("dbs file count", sub_lap=True)

        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n"
            mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n"
            mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n"
            mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n"

            wfi.sendLog('checkor',mismatch_notice)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                                                                          "\n".join( missing_phedex )))
                        were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
                            sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated),
                                                                                                                          "\n".join(were_invalidated)), level='critical')
                            dbs3Client.setFileStatus( were_invalidated, newstatus=0 )
                                
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))
                        were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False
        
        time_point("checked file count", sub_lap=True)

        fraction_invalid = 0.20
        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        files_per_rl = {}
        for output in wfi.request['OutputDatasets']:
            duplications[output] = "skiped"
            files_per_rl[output] = "skiped"

        time_point("checked invalidation", sub_lap=True)

        if (is_closing or bypass_checks) and (not options.ignoreduplicates):
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                    except Exception as e:
                        wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output))
                        sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical')
                        is_closing=False

            if is_closing and any(duplications.values()) and not options.ignoreduplicates:
                duplicate_notice = ""
                duplicate_notice += "%s has duplicates\n"%wfo.name
                duplicate_notice += json.dumps( duplications,indent=2)
                duplicate_notice += '\n'
                duplicate_notice += json.dumps( files_per_rl, indent=2)
                wfi.sendLog('checkor',duplicate_notice)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 


        time_point("checked duplicates", sub_lap=True)

        time_point("done with %s"%wfo.name)

        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            #rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            rec['familly'] = true_familly
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## make the lumi summary 
        if wfi.request['RequestType'] == 'ReReco':
            try:
                os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID']))
                os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID']))
                wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID']))
            except Exception as e:
                print str(e)
        ## make the error report
        
    
        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            wfi.sendLog('checkor',"setting %s closed-out"% wfo.name)
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            if not 'custodial' in assistance_tags or wfi.isRelval():
                ## do only the report for those
                for member in acdc+acdc_inactive+[wfo.name]:
                    try:
                        parse_one(url, member)
                    except:
                        print "Could not make error report for",member

            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that had ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')
                in_manual += 1
            if 'recovery' in assistance_tags and 'manual' in assistance_tags:
                ## this is likely because something bad is happening, so leave it to manual
                assistance_tags = assistance_tags - set(['recovery'])
                assistance_tags.add('manual')
                in_manual += 1

            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name)
                ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                perflink = '%s/report/%s'%(unified_url,wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status))
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec and in_manual!=0:
        sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        items_at = defaultdict(set)
        for i in custodials[site]:
            item, group = i.split('@') if '@' in i else (i,'DataOps')
            items_at[group].add( item )
        for group,items in items_at.items():
            print ','.join(items),'=>',site,'@',group
            if not options.test:
                result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group)
                print result

    print "File Invalidation"
    print invalidations

Example #18

Show file

File: checkor.py Project: menglu21/WmAgentScripts

def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        if forcings:
            sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if max_per_round and not spec: wfs = wfs[:max_per_round]

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue
            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong ibsolute
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                    "\n".join( missing_phedex )))
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations

Example #19

Show file

    def close(self):
        if os.path.isfile('.closor_stop'):
            print "The closing of workflows is shortened"
            return

        url = self.url
        batch_go = self.batch_go
        CI = self.CI
        UC = self.UC
        wfo = self.wfo

        jump_the_line = self.jump_the_line
        batch_goodness = self.batch_goodness
        check_parentage_to_announce = UC.get('check_parentage_to_announce')
        check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

        ## what is the expected #lumis
        self.wfi = workflowInfo(url, wfo.name)
        wfi = self.wfi
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url,
                                                   batch_name,
                                                   details=True)
                batch_go[batch_name] = all(
                    map(
                        lambda s: not s in [
                            'completed', 'running-open', 'running-closed',
                            'acquired', 'staged', 'staging', 'assigned',
                            'assignment-approved'
                        ], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog(
                    'closor',
                    'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'
                    % (batch_name, batch_name))
                return

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived'
                                            ] and not options.force:
            ## manually announced ??
            self.to_status = 'done'
            self.to_wm_status = wfi.request['RequestStatus']
            wfi.sendLog(
                'closor',
                '%s is announced already : %s' % (wfo.name, self.to_wm_status))
            return

        if jump_the_line:
            wfi.sendLog('closor', 'Announcing while completing')

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis'] == 0:
            print wfo.name, "is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            self.outs.append(Output(datasetname=out))
            odb = self.outs[-1]
            odb.workflow = wfo
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())

            fraction = lumi_count / float(expected_lumis) * 100.

            completion_line = "%60s %d/%d = %3.2f%%" % (
                out, lumi_count, expected_lumis, fraction)
            wfi.sendLog('closor', "\t%s" % completion_line)
            if wfi.isRelval() and fraction < batch_goodness:
                self.batch_warnings[wfi.getCampaign()].add(completion_line)
                if fraction < 50:
                    self.batch_extreme_warnings[wfi.getCampaign()].add(
                        completion_line)
            stats[out] = lumi_count
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        #in_full = {}
        for out in outputs:
            all_OK[out] = True
            #in_full[out] = []
            #presence = getDatasetPresence( url, out )
            #where = [site for site,info in presence.items() if info[0]]
            #if where:
            #    all_OK[out] = True
            #    print out,"is in full at",",".join(where)
            #    in_full[out] = copy.deepcopy(where)
            #else:

            #    going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
            #    wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
            #    at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
            #    else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
            #    print json.dumps( at_destination )
            #    print json.dumps( else_where, indent=2 )
            ## do the full stuck transfer study, missing files and shit !
            #for there in going_to:
            #    late_info = findLateFiles(url, out, going_to = there )
            #    for l in late_info:
            #        l.update({"workflow":wfo.name,"dataset":out})
            #    self.all_late_files.extend( late_info )
            #    if check_fullcopy_to_announce:
            ## only set this false if the check is relevant
            #        all_OK[out] = False

        ## verify if we have to do harvesting
        #if not options.no_harvest and not jump_the_line:
        #    #(OK, requests) = spawn_harvesting(url, wfi, in_full)
        #    sites_for_DQMHarvest = UC.get("sites_for_DQMHarvest")
        #    (OK, requests) = spawn_harvesting(url, wfi, sites_for_DQMHarvest)
        #    print "Harvesting workflow has been created and assigned to: "
        #    print sites_for_DQMHarvest
        #    all_OK.update( OK )

        ## only that status can let me go into announced
        if all(all_OK.values()) and (
            (wfi.request['RequestStatus'] in ['closed-out']) or options.force
                or jump_the_line):
            print wfo.name, "to be announced"
            results = []
            if not results:
                for out in outputs:
                    print "dealing with", out
                    if out in stats and not stats[out]:
                        continue
                    _, dsn, process_string, tier = out.split('/')

                    if all_OK[out]:
                        print "setting valid"
                        results.append(
                            setDatasetStatus(out, 'VALID', withFiles=False))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier in UC.get("tiers_to_rucio_relval"):
                            wfi.sendLog(
                                'closor',
                                "Data Tier: %s is blacklisted, so skipping dataset placement for: %s"
                                % (tier, out))
                            continue
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')

                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if destinations:
                            wfi.sendLog(
                                'closor', '%s to go to %s' %
                                (out, ', '.join(sorted(destinations))))

                    elif all_OK[out]:

                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request[
                                    'Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[
                                campaign] and tier in CI.campaigns[campaign][
                                    'toDDM']:
                            to_DDM = True

                        ## by typical enabling
                        if tier in UC.get("tiers_to_rucio_nonrelval"):
                            wfi.sendLog(
                                'closor',
                                "Data Tier: %s is blacklisted, so skipping dataset placement for: %s"
                                % (tier, out))
                            continue

                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get(
                                "tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog(
                                'closor',
                                "could not recognize %s for injecting in DDM" %
                                out,
                                level='critical')
                            continue

                        n_copies = 1
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[
                                campaign]:
                            ddm_instructions = CI.campaigns[campaign][
                                'DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items(
                                ):
                                    if ddmtier == tier or ddmtier in [
                                            '*', 'all'
                                    ]:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(
                                destinations)
                        group_spec = ""  ## not used yet
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                # adding check for PrentageResolved flag from ReqMgr:
                if wfi.request[
                        'RequestType'] == 'StepChain' and check_parentage_to_announce:
                    if wfi.request['ParentageResolved']:
                        results.append(True)
                    else:
                        wfi.sendLog(
                            'closor',
                            "Delayed announcement of %s due to unresolved Parentage dependencies"
                            % wfi.request['RequestName'])
                        results.append('No ParentageResolved')

                if all(
                        map(lambda result: result in ['None', None, True],
                            results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(
                            url, wfo.name)
                        if not res in ['None', None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            self.to_wm_status = wl_bis.request['RequestStatus']
                            if wl_bis.request['RequestStatus'] in [
                                    'announced', 'normal-archived'
                            ]:
                                res = None
                            else:
                                res = reqMgrClient.announceWorkflowCascade(
                                    url, wfo.name)

                        results.append(res)

            print results
            if all(map(lambda result: result in ['None', None, True],
                       results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        self.to_status = wfo.status.replace(
                            'announce', 'announced')
                else:
                    self.to_status = 'done'
                    self.closing = True

                wfi.sendLog('closor', "workflow outputs are announced")
            else:
                wfi.sendLog(
                    'closor', "Error with %s to be announced \n%s" %
                    (wfo.name, json.dumps(results)))

        elif wfi.request['RequestStatus'] in [
                'failed', 'aborted', 'aborted-archived', 'rejected',
                'rejected-archived', 'aborted-completed'
        ]:
            if wfi.isRelval():
                self.to_status = 'forget'
                self.to_wm_status = wfi.request['RequestStatus']
                wfi.sendLog(
                    'closor',
                    "%s is %s, but will not be set in trouble to find a replacement."
                    % (wfo.name, self.to_wm_status))
            else:
                self.to_status = 'trouble'
                self.to_wm_status = wfi.request['RequestStatus']
        else:
            print wfo.name, "not good for announcing:", wfi.request[
                'RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            self.held.add(wfo.name)

Example #20

Show file

def completor(url, specific):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all())
    wfs.extend(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())

    ## just take it in random order so that not always the same is seen
    random.shuffle(wfs)

    ## by workflow a list of fraction / timestamps
    completions = json.loads(open('%s/completions.json' % monitor_dir).read())

    good_fractions = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']

    long_lasting = {}

    overrides = {}
    for rider, email in [('vlimant', '*****@*****.**'),
                         ('jen_a', '*****@*****.**'),
                         ('srimanob', '*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json' % (
            rider[0], rider)
        if not os.path.isfile(rider_file):
            print "no file", rider_file
            continue
        try:
            overrides[rider] = json.loads(open(rider_file).read())
        except:
            print "cannot get force complete list from", rider
            sendEmail("malformated force complet file",
                      "%s is not json readable" % rider_file,
                      destination=[email])

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    print json.dumps(overrides, indent=2)
    max_force = 5

    wfs_no_location_in_GQ = set()

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at", wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)

        skip = False
        if not any([c in wfo.name for c in good_fractions]): skip = True
        for user, spec in overrides.items():
            #print spec
            if wfo.name in spec and wfi.request[
                    'RequestStatus'] != 'force-complete':
                #skip=False ## do not do it automatically yet
                sendEmail(
                    'force-complete requested',
                    '%s is asking for %s to be force complete' %
                    (user, wfo.name))
                wfi = workflowInfo(url, wfo.name)
                forceComplete(url, wfi)
                skip = True
                wfi.notifyRequestor(
                    "The workflow %s was force completed by request of %s" %
                    (wfo.name, user),
                    do_batch=False)
                wfi.sendLog(
                    'completor', '%s is asking for %s to be force complete' %
                    (user, wfo.name))
                break

        if wfo.status.startswith('assistance'): skip = True

        if skip:
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in [
                'acquired', 'running-open', 'running-closed'
        ]:
            continue

        c = wfi.request['Campaign']
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.

        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request:
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.)

        running_log = filter(
            lambda change: change["Status"
                                  ] in ["running-open", "running-closed"],
            wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60. * 60. * 24.)
        delay = now - then  ## in days

        (w, d) = divmod(delay, 7)
        print "\t" * int(
            w) + "Running since", delay, "[days] priority=", priority
        if delay <= 7: continue
        if delay >= 7:
            long_lasting[wfo.name] = {"delay": delay}
            pass

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue  ## that does not count
            if not output in completions:
                completions[output] = {
                    'injected': None,
                    'checkpoints': [],
                    'workflow': wfo.name
                }
            ## get completion fraction
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion = 0.
            event_completion = 0.
            if lumi_expected:
                lumi_completion = lumi_count / float(lumi_expected)
            if event_expected:
                event_completion = event_count / float(event_expected)

            #take the less optimistic
            percent_completions[output] = min(lumi_completion,
                                              event_completion)
            completions[output]['checkpoints'].append((now, event_completion))

        if all([
                percent_completions[out] >= good_fraction
                for out in percent_completions
        ]):
            print "all is above", good_fraction, "for", wfo.name
            print json.dumps(percent_completions, indent=2)
        else:
            print "\t", percent_completions.values(
            ), "not over bound", good_fraction
            long_lasting[wfo.name].update({
                'completion':
                sum(percent_completions.values()) / len(percent_completions),
                'completions':
                percent_completions
            })

            #print json.dumps( percent_completions, indent=2 )

            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            print json.dumps(long_lasting[wfo.name]['agents'], indent=2)

            ## pick up on possible issue with global queue data location
            locs = wfi.getGQLocations()
            for b, loc in locs.items():
                if not loc:
                    print b, "has no location for GQ in", wfi.request[
                        'RequestName']
                    ## this is severe !
                    wfs_no_location_in_GQ.add(wfo.name)
                ## check location and site white list
                can_run = set([SI.SE_to_CE(se) for se in loc]) & set(
                    wfi.request['SiteWhitelist'])
                if loc and not can_run:
                    print b, "is missing site to run within the whitelist"
                    wfs_no_location_in_GQ.add(wfo.name)
                can_run = can_run & set(SI.sites_ready)
                if loc and not can_run:
                    print b, "is missing available site to run"
                    wfs_no_location_in_GQ.add(wfo.name)
            continue

        if all([
                percent_completions[out] >= ignore_fraction
                for out in percent_completions
        ]):
            print "all is done, just wait a bit"
            continue

        for output in percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')

        ran_at = wfi.request['SiteWhitelist']
        print "Required:", cpuh,
        print "Time spend:", delay

        # only really force complete after n days
        if delay <= 14: continue
        print "going for force-complete of", wfo.name
        ## find ACDCs that might be running
        if max_force > 0:
            forceComplete(url, wfi)
            max_force -= 1
        else:
            print "too many completion this round"
        ## do it once only for testing
        #break

    open('%s/completions.json' % monitor_dir,
         'w').write(json.dumps(completions, indent=2))
    text = "These have been running for long"

    open('%s/longlasting.json' % monitor_dir,
         'w').write(json.dumps(long_lasting, indent=2))

    for wf, info in sorted(long_lasting.items(),
                           key=lambda tp: tp[1]['delay'],
                           reverse=True):
        delay = info['delay']
        text += "\n %s : %s days" % (wf, delay)
        if 'completion' in info:
            text += " %d%%" % (info['completion'] * 100)

    if wfs_no_location_in_GQ:
        sendEmail(
            'workflow with no location in GQ',
            "there won't be able to run anytime soon\n%s" %
            ('\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text