Esempio n. 1
0
def forceComplete(url, wfi):
    familly = getWorkflowById( url, wfi.request['PrepID'] ,details=True)
    for member in familly:
        ### if member['RequestName'] == wl['RequestName']: continue ## set himself out
        if member['RequestDate'] < wfi.request['RequestDate']: continue
        if member['RequestStatus'] in ['None',None]: continue
        ## then set force complete all members
        if member['RequestStatus'] in ['running-opened','running-closed']:
            #sendEmail("force completing","%s is worth force completing\n%s"%( member['RequestName'] , percent_completions))
            print "setting",member['RequestName'],"force-complete"
            reqMgrClient.setWorkflowForceComplete(url, member['RequestName'])
        elif member['RequestStatus'] in ['acquired','assignment-approved']:
            print "rejecting",member['RequestName']
            reqMgrClient.invalidateWorkflow(url, member['RequestName'], current_status=member['RequestStatus'])
Esempio n. 2
0
def forceComplete(url, wfi):
    familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
    for member in familly:
        ### if member['RequestName'] == wl['RequestName']: continue ## set himself out
        if member['RequestDate'] < wfi.request['RequestDate']: continue
        if member['RequestStatus'] in ['None', None]: continue
        ## then set force complete all members
        if member['RequestStatus'] in ['running-opened', 'running-closed']:
            #sendEmail("force completing","%s is worth force completing\n%s"%( member['RequestName'] , percent_completions))
            print "setting", member['RequestName'], "force-complete"
            reqMgrClient.setWorkflowForceComplete(url, member['RequestName'])
        elif member['RequestStatus'] in ['acquired', 'assignment-approved']:
            print "rejecting", member['RequestName']
            reqMgrClient.invalidateWorkflow(
                url,
                member['RequestName'],
                current_status=member['RequestStatus'])
Esempio n. 3
0
def completor(url, specific):
    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        ## get all of the same
        wl = getWorkLoad(url, wfo.name)
        familly = getWorkflowById( url, wl['PrepID'] ,details=True)
        for member in familly:
            ### if member['RequestName'] == wl['RequestName']: continue
            if member['RequestDate'] < wl['RequestDate']: continue
            if member['RequestStatus'] in ['None',None]: continue
            ## then set force complete all members
            if member['RequestStatus'] in ['running-opened','running-closed']:
                print "setting",member['RequestName'],"force-complete"
                reqMgrClient.setWorkflowForceComplete(url, member['RequestName'])
Esempio n. 4
0
def completor(url, specific):
    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all())
    wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance")).all())

    for wfo in wfs:
        if specific and not specific in wfo.name:
            continue

        ## get all of the same
        wl = getWorkLoad(url, wfo.name)
        familly = getWorkflowById(url, wl["PrepID"], details=True)
        for member in familly:
            ### if member['RequestName'] == wl['RequestName']: continue
            if member["RequestDate"] < wl["RequestDate"]:
                continue
            if member["RequestStatus"] in ["None", None]:
                continue
            ## then set force complete all members
            if member["RequestStatus"] in ["running-opened", "running-closed"]:
                print "setting", member["RequestName"], "force-complete"
                reqMgrClient.setWorkflowForceComplete(url, member["RequestName"])
Esempio n. 5
0
def injector(url, options, specific):

    ## passing a round of invalidation of what needs to be invalidated
    if options.invalidate:
        invalidator(url)

    workflows = getWorkflows(url, status=options.wmstatus,user=options.user)
    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting",wf
            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) 
            session.add( new_wf )
            session.commit()


    existing = [wf.name for wf in session.query(Workflow).all()]


    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById( url, wl['PrepID'] )
        if len(familly)==1:
            print wf.name,"ERROR has no replacement"
            continue
        print wf.name,"has",len(familly),"familly members"
        for member in familly:
            if member != wf.name:
                fwl = getWorkLoad(url , member)
                if options.replace:
                    if member != options.replace: continue
                else:
                    if fwl['RequestDate'] < wl['RequestDate']: continue
                    if fwl['RequestType']=='Resubmission': continue
                    if fwl['RequestStatus'] in ['None',None]: continue

                new_wf = session.query(Workflow).filter(Workflow.name == member).first()
                if not new_wf:
                    print "putting",member
                    status = 'away'
                    if fwl['RequestStatus'] in ['assignment-approved']:
                        status = 'considered'
                    new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                    wf.status = 'forget'
                    session.add( new_wf ) 
                    session.commit()
                else:
                    if new_wf.status == 'forget': continue
                    print "getting",new_wf.name,"as replacement of",wf.name


                for tr in session.query(Transfer).all():
                    if wf.id in tr.workflows_id:
                        sw = copy.deepcopy(tr.workflows_id)
                        sw.remove( wf.id)
                        sw.append(new_wf.id)
                        tr.workflows_id = sw
                        print tr.phedexid,"got",new_wf.name
                        if new_wf.status != 'away':
                            new_wf.status = 'staging'
                        session.commit()
                        

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
Esempio n. 6
0
def invalidator(url, invalid_status='INVALID'):
    use_mcm = True
    up = componentInfo(mcm=use_mcm)
    if not up.check(): return
    mcm = McMClient(dev=False)

    invalids = mcm.getA('invalidations', query='status=announced')
    print len(invalids), "Object to be invalidated"
    text_to_batch = defaultdict(str)
    text_to_request = defaultdict(str)
    for invalid in invalids:
        acknowledge = False
        pid = invalid['prepid']
        batch_lookup = invalid['prepid']
        text = ""
        if invalid['type'] == 'request':
            wfn = invalid['object']
            print "need to invalidate the workflow", wfn
            wfo = session.query(Workflow).filter(Workflow.name == wfn).first()
            if wfo:
                ## set forget of that thing (although checkor will recover from it)
                print "setting the status of", wfo.status, "to forget"
                wfo.status = 'forget'
                session.commit()
            else:
                ## do not go on like this, do not acknoledge it
                print wfn, "is set to be rejected, but we do not know about it yet"
                #continue
            wfi = workflowInfo(url, wfn)
            success = "not rejected"
            ## to do, we should find a way to reject the workflow and any related acdc
            success = reqMgrClient.invalidateWorkflow(
                url, wfn, current_status=wfi.request['RequestStatus'])
            ## need to find the whole familly and reject the whole gang
            familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
            for fwl in familly:
                if fwl['RequestDate'] < wfi.request['RequestDate']: continue
                if fwl['RequestType'] != 'Resubmission': continue
                print "rejecting", fwl['RequestName']
                success = reqMgrClient.invalidateWorkflow(
                    url,
                    fwl['RequestName'],
                    current_status=fwl['RequestStatus'])

            print success
            acknowledge = True
            text = "The workflow %s (%s) was rejected due to invalidation in McM" % (
                wfn, pid)
            batch_lookup = wfn  ##so that the batch id is taken as the one containing the workflow name
        elif invalid['type'] == 'dataset':
            dataset = invalid['object']

            if '?' in dataset: continue
            if 'None' in dataset: continue
            if 'None-' in dataset: continue
            if 'FAKE-' in dataset: continue

            print "setting", dataset, "to", invalid_status
            success = setDatasetStatus(dataset, invalid_status)
            if success:
                acknowledge = True
                text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % (
                    dataset, pid)
            else:
                print "invalidation of", dataset, "did not go so well"
        else:
            print "\t\t", invalid['type'], " type not recognized"

        if acknowledge:
            ## acknoldge invalidation in mcm, provided we can have the api
            print "acknowledgment to mcm"
            mcm.get('/restapi/invalidations/acknowledge/%s' % (invalid['_id']))
            # prepare the text for batches
            batches = []
            batches.extend(
                mcm.getA('batches', query='contains=%s' % batch_lookup))
            batches = filter(
                lambda b: b['status'] in ['announced', 'done', 'reset'],
                batches)
            if len(batches):
                bid = batches[-1]['prepid']
                print "batch nofication to", bid
                text_to_batch[bid] += text + "\n\n"
            # prepare the text for requests
            text_to_request[pid] += text + "\n\n"

    for bid, text in text_to_batch.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/batches/notify', {"notes": text, "prepid": bid})
        pass
    for pid, text in text_to_request.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/requests/notify', {
            "message": text,
            "prepids": [pid]
        })
Esempio n. 7
0
def injector(url, options, specific):
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm','wtc','jira'] )
    if not up.check(): return
    use_mcm = up.status['mcm']

    UC = unifiedConfiguration()

    transform_keywords = UC.get('convert_to_stepchain')

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for user in UC.get("user_rereco"):
        workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco")) 
    for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")) :
        workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain")) 
    for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")) :
        workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults"))

    print len(workflows),"in line"
    cannot_inject = set()
    to_convert = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue

        exists = session.query(Workflow).filter(Workflow.name == wf ).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend( getWorkflowById( url, pid, details=True) )
                    
                familly = []
                print len(req_familly),"members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend( session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all() )

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in ['forget','trouble','forget-unlock','forget-out-unlock']:
                        wfi.sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status ))
                        sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status ), level='critical')
                        print "Should not put",wf,"because of",lwfo.name,lwfo.status
                        cannot_inject.add( wf )
                        can_add = False
            ## add a check on validity of input datasets
            _,prim,par,sec = wfi.getIO()
            for d in list(prim)+list(par)+list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector',"One of the input is not VALID. %s : %s"%( d, status_cache[d]))
                    sendLog('injector',"One of the input of %s is not VALID. %s : %s"%( wf, d, status_cache[d]), level='critical')
                    can_add = False
                #else:
                #    ##make sure that all blocks get closed
                #    closeAllBlocks(url, d)

                ## check for any file in phedex, to verify existence
                _,ph_files,_,_ = getDatasetFiles(url, d)
                if not ph_files and not ( 'StoreResults' == wfi.request.setdefault('RequestType',None) ):
                    wfi.sendLog('injector',"One of the input has no file in phedex: %s" % d )
                    sendLog('injector',"One of the input has no file in phedex: %s"% d, level='critical')
                    can_add = False

            ### ban some workflow that you don't like anymore
            #outputs = wfi.request['OutputDatasets']



            if not can_add: continue

            ## temporary hack to transform specific taskchain into stepchains
            good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords)
            #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = None) 


            ## match keywords and technical constraints
            if (not options.no_convert) and good_for_stepchain and not wfi.isRelval():
                to_convert.add( wf )
                wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf)
                sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf)

            wfi.sendLog('injector',"considering %s"%wf)

            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) 
            session.add( new_wf )
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass
    

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog('injector','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)), level='critical')
        
    for wf in to_convert:
        os.system('./Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s'% wf)

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    #print "getting all transfers"
    #all_transfers=session.query(Transfer).all()
    #print "go!"

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name )
        wl = wfi.request #getWorkLoad(url, wf.name)
        familly = getWorkflowById( url, wl['PrepID'] )
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url , member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType']=='Resubmission': continue
                if fwl['RequestStatus'] in ['None',None,'new']: continue
                if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue
            true_familly.append( fwl )

        if len(true_familly)==0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            if wfi.isRelval():
                #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.')
                wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget')
                wf.status = 'forget'
                session.commit()
            else:
                wfi.sendLog('injector','the workflow was found in trouble with no replacement')
                no_replacement.add( wf.name )
            continue
        else:
            wfi.sendLog('injector','the workflow was found in trouble and has a replacement')
                    
        print wf.name,"has",len(familly),"familly members"
        print wf.name,"has",len(true_familly),"true familly members"

        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly)>1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector','Multiple wf in line, will take the last one for %s \n%s'%( wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector',"putting %s as replacement of %s"%( member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                wf.status = 'forget'
                session.add( new_wf ) 
            else:
                if new_wf.status == 'forget': continue
                sendLog('injector',"getting %s as replacement of %s"%( new_wf.name, wf.name ))
                wf.status = 'forget'

            for tr in session.query(TransferImp).filter( TransferImp.workflow_id == wf.id).all():
                ## get all transfer working for the old workflow
                existing = session.query(TransferImp).filter( TransferImp.phedexid == tr.phedexid).filter( TransferImp.workflow_id == new_wf.id).all()
                tr.active = False ## disable the old one
                if not existing:
                    ## create the transfer object for the new dependency
                    tri = TransferImp( phedexid = tr.phedexid,
                                       workflow = new_wf)
                    session.add( tri )
                session.commit()


        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector','workflow with no replacement\n%s \n are dangling there'% ( '\n'.join(no_replacement)), level='critical')
Esempio n. 8
0
def injector(url, options, specific):

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend(
        getWorkflows(url,
                     status=options.wmstatus,
                     user='******',
                     rtype="ReReco")
    )  ## regardless of users, pick up all ReReco on the table

    print len(workflows), "in line"
    cannot_inject = set()
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            #wl = getWorkLoad(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            #            print wfi.request
            familly = session.query(Workflow).filter(
                Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                #req_familly = getWorkflowById( url, wl['PrepID'])
                #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly]
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))

                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url,
                                        req_member['RequestName'],
                                        request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(
                            session.query(Workflow).filter(
                                Workflow.name ==
                                req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in [
                            'forget', 'trouble', 'forget-unlock',
                            'forget-out-unlock'
                    ]:
                        sendLog(
                            'injector', "Should not put %s because of %s %s" %
                            (wf, lwfo.name, lwfo.status))
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False
            if not can_add: continue
            wfi.sendLog('injector', "considering %s" % wf)
            new_wf = Workflow(name=wf,
                              status=options.setstatus,
                              wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog(
            'injector',
            'These workflow cannot be added in because of duplicates \n\n %s' %
            ('\n'.join(cannot_inject)),
            level='warning')

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    ## pick up replacements
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in [
                        'rejected', 'rejected-archived', 'aborted',
                        'aborted-archived'
                ]:
                    continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            wfi.sendLog(
                'injector',
                'the workflow was found in trouble with no replacement')
            no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog(
                'injector',
                'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"

        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector',
                    'Multiple wf in line, will take the last one for %s \n%s' %
                    (wf.name, ', '.join(fwl['RequestName']
                                        for fwl in true_familly)),
                    level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(
                Workflow.name == member).first()
            if not new_wf:
                sendLog('injector',
                        "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member,
                                  status=status,
                                  wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog(
                    'injector',
                    "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove(wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid, "got", new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid < 0:  ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector',
                'workflow with no replacement, %s \n are dangling there' %
                ('\n'.join(no_replacement)),
                level='critical')
Esempio n. 9
0
])
may_have_one.update([
    wfo.name for wfo in session.query(Workflow).filter(
        Workflow.status.startswith('assistance')).all()
])

wfs = []
wfs.extend(getWorkflows(url, 'running-open', details=True))
wfs.extend(getWorkflows(url, 'running-closed', details=True))
wfs.extend(getWorkflows(url, 'completed', details=True))

may_have_one_too = set()
for wf in wfs:
    if wf['RequestName'] in may_have_one:
        #print wf['RequestName'],"and familly"
        may_have_one_too.update(getWorkflowById(url, wf['PrepID']))

may_have_one.update(may_have_one_too)

## keep all relval reports for *ever* ...
batches = json.loads(open('%s/batches.json' % base_eos_dir).read())
for b, pids in batches.items():
    for pid in pids:
        wfs = getWorkflowById(url, pid, details=True)
        for wf in wfs:
            ## check on the announce date
            announced = filter(
                lambda o: o['Status'] in ['announced', 'rejected', 'aborted'],
                wf['RequestTransition'])  ## check on any final state
            if announced:
                announced_time = max([a['UpdateTime'] for a in announced])
Esempio n. 10
0
def rejector(url, specific, options=None):

    up = componentInfo(soft=['wtc'])
    if not up.check(): return

    if specific and specific.startswith('/'):
        ## this is for a dataset
        print setDatasetStatus(specific, 'INVALID')
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend(
                session.query(Workflow).filter(
                    Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
        if not wfs:
            batches = json.loads(open('batches.json').read())
            for bname in batches:
                if specific == bname:
                    for wf in batches[bname]:
                        wfs.append(
                            session.query(Workflow).filter(
                                Workflow.name == wf).first())
    else:
        wfs = session.query(Workflow).filter(
            Workflow.status == 'assistance-clone').all()
        #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all())
        ## be careful then on clone case by case
        options.clone = True
        print "not supposed to function yet"
        return

    print len(wfs), "to reject"

    if len(wfs) > 1:
        print "\n".join([wfo.name for wfo in wfs])
        answer = raw_input('Reject these')
        if not answer.lower() in ['y', 'yes']:
            return

    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", spec
            return
        results = []
        wfi = workflowInfo(url, wfo.name)

        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(
            url, wfo.name, current_status=wfi.request['RequestStatus'])

        comment = ""
        if options.comments: comment = ", reason: " + options.comments
        wfi.sendLog(
            'rejector',
            'invalidating the workflow by unified operator%s' % comment)
        ## need to find the whole familly and reject the whole gang
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        for fwl in familly:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType'] != 'Resubmission': continue
            ## does not work on second order acd
            #if 'OriginalRequestName' in fwl and fwl['OriginalRequestName'] != wfi.request['RequestName']: continue
            print "rejecting", fwl['RequestName']
            reqMgrClient.invalidateWorkflow(
                url,
                fwl['RequestName'],
                current_status=fwl['RequestStatus'],
                cascade=False)
            datasets.update(fwl['OutputDatasets'])

        for dataset in datasets:
            if options.keep:
                print "keeping", dataset, "in its current status"
            else:
                results.append(setDatasetStatus(dataset, 'INVALID'))
                pass

        if all(map(lambda result: result in ['None', None, True], results)):
            print wfo.name, "and", datasets, "are rejected"
            if options and options.clone:
                wfo.status = 'trouble'
                session.commit()
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(
                        schema['ProcessingVersion']
                    ) + 1  ## dubious str->int conversion
                else:
                    schema['ProcessingVersion'] = 2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tampering of the original request
                if options.Memory:
                    if schema['RequestType'] == 'TaskChain':
                        it = 1
                        while True:
                            t = 'Task%d' % it
                            it += 1
                            if t in schema:
                                schema[t]['Memory'] = options.Memory
                            else:
                                break
                    else:
                        schema['Memory'] = options.Memory

                if options.Multicore:
                    ## to do : set it properly in taskchains
                    if schema['RequestType'] == 'TaskChain':
                        tasks, set_to = options.Multicore.split(
                            ':') if ':' in options.Multicore else (
                                "", options.Multicore)
                        set_to = int(set_to)
                        tasks = tasks.split(',') if tasks else ['Task1']
                        it = 1
                        while True:
                            tt = 'Task%d' % it
                            it += 1
                            if tt in schema:
                                tname = schema[tt]['TaskName']
                                if tname in tasks or tt in tasks:
                                    mem = schema[tt]['Memory']
                                    mcore = schema[tt].get('Multicore', 1)
                                    factor = (set_to / float(mcore))
                                    fraction_constant = 0.4
                                    mem_per_core_c = int(
                                        (1 - fraction_constant) * mem /
                                        float(mcore))
                                    print "mem per core", mem_per_core_c
                                    print "base mem", mem
                                    ## adjusting the parameter in the clone
                                    schema[tt]['Memory'] += (
                                        set_to - mcore) * mem_per_core_c
                                    schema[tt]['Multicore'] = set_to
                                    schema[tt]['TimePerEvent'] /= factor
                            else:
                                break
                    else:
                        schema['Multicore'] = options.Multicore
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent

                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int, options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1, ntask - 1):
                        schema['Task%d' % it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask - 1
                    schema.pop('Task%d' % ntask)

                if options.priority:
                    schema['RequestPriority'] = options.priority

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop shit on the way to reqmgr2
                schema = reqMgrClient.purgeClonedSchema(schema)

                print "submitting"
                if (options.to_stepchain
                        and (schema['RequestType'] == 'TaskChain')):
                    ## transform the schema into StepChain schema
                    print "Transforming a TaskChain into a StepChain"
                    schema['RequestType'] = 'StepChain'
                    schema['StepChain'] = schema.pop('TaskChain')
                    schema['SizePerEvent'] = 0
                    schema['TimePerEvent'] = 0
                    step = 1
                    s_n = {}
                    while True:
                        if 'Task%d' % step in schema:
                            schema['Step%d' % step] = schema.pop('Task%d' %
                                                                 step)
                            schema['Step%d' % step]['StepName'] = schema[
                                'Step%d' % step].pop('TaskName')
                            s_n[schema['Step%d' %
                                       step]['StepName']] = 'Step%d' % step
                            if 'InputTask' in schema['Step%d' % step]:
                                schema['Step%d' % step]['InputStep'] = schema[
                                    'Step%d' % step].pop('InputTask')
                            eff = 1.
                            up_s = 'Step%d' % step
                            while True:
                                ## climb up a step. supposedely already all converted
                                up_s = s_n.get(
                                    schema[up_s].get('InputStep', None), None)
                                if up_s:
                                    ## multiply with the efficiency
                                    eff *= schema[up_s].get(
                                        'FilterEfficiency', 1.)
                                else:
                                    ## or stop there
                                    break

                            if not 'KeepOutput' in schema['Step%d' % step]:
                                ## this is a weird translation capability. Absence of keepoutput in step means : keep the output. while in TaskChain absence means : drop
                                schema['Step%d' % step]['KeepOutput'] = False
                            schema['TimePerEvent'] += eff * schema[
                                'Step%d' % step].pop('TimePerEvent')
                            schema['SizePerEvent'] += eff * schema[
                                'Step%d' % step].pop('SizePerEvent')
                            step += 1
                        else:
                            break

                print json.dumps(schema, indent=2)
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning", wfo.name
                    print json.dumps(schema, indent=2)
                    return
                print newWorkflow

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfi.sendLog(
                    'rejector', 'Cloned into %s by unified operator %s' %
                    (newWorkflow, comment))
                wfi.notifyRequestor('Cloned into %s by unified operator %s' %
                                    (newWorkflow, comment),
                                    do_batch=False)
            else:
                wfo.status = 'trouble' if options.set_trouble else 'forget'
                wfi.notifyRequestor('Rejected by unified operator %s' %
                                    (comment),
                                    do_batch=False)
                session.commit()

        else:
            print "error in rejecting", wfo.name, results
Esempio n. 11
0
def rejector(url, specific, options=None):

    up = componentInfo()

    if specific and specific.startswith('/'):
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend( session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        return 

    print len(wfs),"to reject"

    if len(wfs)>1:
        print "\n".join( [wfo.name for wfo in wfs] )
        answer = raw_input('Reject these')
        if not answer.lower() in ['y','yes']:
            return
        
    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject",spec
            return
        results=[]
        wfi = workflowInfo(url, wfo.name)

        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(url, wfo.name, current_status=wfi.request['RequestStatus'])

        wfi.sendLog('rejector','invalidating the workflow by unified operator')
        ## need to find the whole familly and reject the whole gang
        familly = getWorkflowById( url, wfi.request['PrepID'] , details=True)
        for fwl in familly:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType']!='Resubmission': continue
            if 'OriginalRequestName' in fwl and fwl['OriginalRequestName'] != wfi.request['RequestName']: continue
            print "rejecting",fwl['RequestName']
            reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'])
            datasets.update( fwl['OutputDatasets'] )

        for dataset in datasets:
            if options.keep:
                print "keeping",dataset,"in its current status"
            else:
                results.append( setDatasetStatus(dataset, 'INVALID') )

        #if wfi.request['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']:
        #    print 'already',wfi.request['RequestStatus']
        #    if not options.clone:
        #        wfo.status = 'forget'
        #        session.commit()
        #        continue

        if all(map(lambda result : result in ['None',None,True],results)):
            wfo.status = 'forget'
            session.commit()
            print wfo.name,"and",datasets,"are rejected"
            if options and options.clone:
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion'])+1 ## dubious str->int conversion
                else:
                    schema['ProcessingVersion']=2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tampering of the original request
                if options.Memory:
                    schema['Memory'] = options.Memory
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup']  = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent

                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int,options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] =options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1,ntask-1):
                        schema['Task%d'%it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask-1
                    schema.pop('Task%d'%ntask)

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop shit on the way to reqmgr2
                for p in ['RequestStatus',
                          'RequestTransition',
                          'RequestorDN',
                          'RequestWorkflow',
                          'OutputDatasets',
                          'ReqMgr2Only',
                          #'Group',
                          'RequestDate',
                          #'ConfigCacheUrl',
                          'RequestName',
                          'timeStamp',
                          'SoftwareVersions',
                          'CouchURL'
                          ]:
                    if p in schema:
                        schema.pop( p )
                        #pass
                print "submitting"
                print json.dumps( schema, indent=2 )
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning",wfo.name
                    print json.dumps( schema, indent=2 )
                    return 
                print newWorkflow
                #m = re.search("details\/(.*)\'",response)
                #if not m:
                #    print "error in cloning",wfo.name
                #    print response
                #    print json.dumps( schema, indent=2 )
                #    return 
                #newWorkflow = m.group(1)

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfo.status = 'trouble'
                session.commit()
                wfi.sendLog('rejector','Cloned into %s by unified operator'%( newWorkflow ))
        else:
            print "error in rejecting",wfo.name,results
Esempio n. 12
0
def checkor(url, spec=None, options=None):
    fDB = falseDB()

    wfs=[]
    if options.fetch:
        #workflows = getWorkflows(url, status='completed')
        #for wf in workflows:
        #    wfo = session.query(Workflow).filter(Workflow.name == wf ).first()
        #    if wfo:
        #        if not wfo.status in ['away','assistance']: continue
        #        wfs.append(wfo )
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )
    else:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue

        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            print wfo.name,"is already",wfo.wm_status
            wfo.status = 'close'
            session.commit()
            continue
        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived']:
            ## went into trouble
            wfo.status = 'trouble'
            print wfo.name,"is in trouble",wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            print wfo.name,"not running yet"
            session.commit()
            continue
        
        
        if wfo.wm_status != 'completed':
            ## for sure move on with closeout check if in completed
            print "no need to check on",wfo.name,"in status",wfo.wm_status
            session.commit()
            continue

        session.commit()        
        sub_assistance="" # if that string is filled, there will be need for manual assistance

        is_closing = True
        ## do the closed-out checks one by one

        # tuck out DQMIO/DQM
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        for member in familly:
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestStatus'] in ['running-opened','running-closed','assignment-approved','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                #print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False

        ## completion check
        percent_completions = {}
        event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        fractions_pass = {}
        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                print "overriding fraction to",fractions_pass[output],"for",output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

        if not all([percent_completions[out] > fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            sub_assistance+='-recovery'
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 300.
            campaign = get_campaign(output, wfi)
            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign
            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
            lumi_upper_limit[output] = upper_limit
        
        if any([ events_per_lumi[out] > lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance+='-biglumi'
            is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        vetoed_custodial_tier = ['MINIAODSIM']
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"
                        break
            ## get from the parent
            if not custodial and 'InputDataset' in wfi.request:
                parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort"
                    continue

            if not custodial:
                ## pick one at random
                custodial = SI.pick_SE()

            if custodial and not sub_assistance and not acdc:
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        custodials[custodial].append( output )
            else:
                print "cannot find a custodial for",wfo.name
            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance+="-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            for output in wfi.request['OutputDatasets']:
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output )
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output )
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                sub_assistance+='-duplicates'
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = len(acdc)

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                # set it from away/assistance* to close
                wfo.status = 'close'
                session.commit()
        else:
            print wfo.name,"needs assistance"
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            wfo.status = 'assistance'+sub_assistance
            if not options.test:
                print "setting",wfo.name,"to",wfo.status
                session.commit()

    fDB.summary()
    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low')
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Esempio n. 13
0
parser.add_option('--workflows',help='Coma separated list of workflows to recover')
def_assignoptions = '--site acdc --xrootd'
parser.add_option('--assignoptions',default=def_assignoptions, help='The options to pass to assign.py (default is %s)'%def_assignoptions)
parser.add_option('--nocreation', default=False, action = 'store_true', help='Do not inject new request')
(options,args) = parser.parse_args()

wfs=options.workflows.split(',')

for wf in wfs:
    acdcs = []

    if options.nocreation:
        wfi = workflowInfo( reqmgr_url, wf)
        pid = wfi.request['PrepID']
        familly = filter(lambda r : r['RequestStatus'] == 'assignment-approved' and 'ACDC' in r['RequestName'],
                         getWorkflowById( reqmgr_url, pid, details=True))
        acdcs = [r['RequestName'] for r in familly]
    else:
        com = './makeACDC.py --all -w %s'% wf
        print com 
        y = (raw_input('go ?') if not options.dry else 'dry') if not options.go else 'y'
        if y.lower() in ['y','yes','go','ok']:
            makeall = os.popen('./makeACDC.py --all -w %s'% wf).read()
            for s in [l.split() for l in makeall.split('\n')]:
                if len(s)!=3: continue
                if s[1]!='for' : continue
                acdcs.append( s[0] )
    print acdcs
    #sec='--secondary_x' if 'RunIISummer16DR80Premix' in wf else ''
    sec=''
    for acdc in acdcs:
Esempio n. 14
0
#!/usr/bin/env python
from utils import reqmgr_url, getWorkflowById
from JIRAClient import JIRAClient

JC = JIRAClient()
if JC:
    those = JC.find({'status': '!CLOSED'})
    for t in those:
        s = t.fields.summary
        s = s.replace('issues', '')
        s = s.strip()
        if s.count(' ') != 0: continue
        print s
        wfs = getWorkflowById(reqmgr_url, s, details=True)
        statuses = set([r['RequestStatus'] for r in wfs])
        check_against = [
            'assignment-approved', 'running-open', 'running-closed',
            'completed', 'acquired', 'staging', 'staged', 'assigned',
            'closed-out', 'failed'
        ]
        if statuses:
            if all([s not in check_against for s in statuses]):
                print t.key, "can be closed"
                print statuses
                JC.close(t.key)  ## uncomment to close JIRAs
                continue
        print t.key, statuses
Esempio n. 15
0
def injector(url, options, specific):

    use_mcm = True
    up = componentInfo( mcm = use_mcm, soft=['mcm'] )
    if not up.check(): return
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco")) ## regardless of users, pick up all ReReco on the table

    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting",wf
            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) 
            session.add( new_wf )
            session.commit()
            time.sleep(1)


    existing = [wf.name for wf in session.query(Workflow).all()]

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)


    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById( url, wl['PrepID'] )
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url , member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType']=='Resubmission': continue
                if fwl['RequestStatus'] in ['None',None]: continue
            true_familly.append( fwl )

        if len(true_familly)==0:
            print wf.name,"ERROR has no replacement"
            known = []
            try:
                known = json.loads(open('no_replacement.json').read())
            except:
                pass
            if not wf.name in known:
                sendEmail('workflow in %s with no replacement'%(wl['RequestStatus']),'%s is dangling there'%(wf.name))
                known.append( wf.name )
                open('no_replacement.json','w').write( json.dumps( known, indent=2 ))
            continue
        print wf.name,"has",len(familly),"familly members"
        print wf.name,"has",len(true_familly),"true familly members"

        for fwl in true_familly:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                print "putting",member,"as replacement of",wf.name
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                wf.status = 'forget'
                session.add( new_wf ) 
            else:
                if new_wf.status == 'forget': continue
                print "getting",new_wf.name,"as replacement of",wf.name
                wf.status = 'forget'

            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove( wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid,"got",new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid<0: ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()
                        

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
Esempio n. 16
0
def rejector(url, specific, options=None):

    
    up = componentInfo(soft=['wtc','jira'])
    if not up.check(): return

    if specific and specific.startswith('/'):
        ## this is for a dataset
        print setDatasetStatus(specific, 'INVALID')
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend( session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
        if not wfs:
            batches = batchInfo().content()
            for bname in batches:
                if specific == bname:
                    for pid in batches[bname]:
                        b_wfs = getWorkflowById(url, pid)
                        for wf in b_wfs:
                            wfs.append( session.query(Workflow).filter(Workflow.name == wf).first())
                    break
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'assistance-clone').all()
        #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all())
        ## be careful then on clone case by case
        options.clone = True
        print "not supposed to function yet"
        return 

    print len(wfs),"to reject"

    if len(wfs)>1:
        print "\n".join( [wfo.name for wfo in wfs] )
        answer = raw_input('Reject these')
        if not answer.lower() in ['y','yes']:
            return
        
    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject",spec
            return
        wfi = workflowInfo(url, wfo.name)

        comment=""
        if options.comments: comment = ", reason: "+options.comments
        if options.keep: 
            wfi.sendLog('rejector','invalidating the workflow by unified operator%s'%comment)
        else:
            wfi.sendLog('rejector','invalidating the workflow and outputs by unified operator%s'%comment)

        results = invalidate(url, wfi, only_resub=True, with_output= (not options.keep))

        if all(results):
            print wfo.name,"rejected"
            if options and options.clone:
                wfo.status = 'trouble'
                session.commit()                
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion'])+1 ## dubious str->int conversion
                else:
                    schema['ProcessingVersion']=2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tampering of the original request
                if options.Memory:
                    if schema['RequestType'] == 'TaskChain':
                        it=1
                        while True:
                            t = 'Task%d'%it
                            it+=1
                            if t in schema:
                                schema[t]['Memory'] = options.Memory
                            else:
                                break
                    else:
                        schema['Memory'] = options.Memory
                        
                if options.Multicore:
                    ## to do : set it properly in taskchains
                    if schema['RequestType'] == 'TaskChain':
                        tasks,set_to = options.Multicore.split(':') if ':' in options.Multicore else ("",options.Multicore)
                        set_to = int(set_to)
                        tasks = tasks.split(',') if tasks else ['Task1']
                        it = 1 
                        while True:
                            tt = 'Task%d'% it
                            it+=1
                            if tt in schema:
                                tname = schema[tt]['TaskName']
                                if tname in tasks or tt in tasks:
                                    mem = schema[tt]['Memory']
                                    mcore = schema[tt].get('Multicore',1)
                                    factor = (set_to / float(mcore))
                                    fraction_constant = 0.4
                                    mem_per_core_c = int((1-fraction_constant) * mem / float(mcore))
                                    print "mem per core", mem_per_core_c
                                    print "base mem", mem
                                    ## adjusting the parameter in the clone
                                    schema[tt]['Memory'] += (set_to-mcore)*mem_per_core_c
                                    schema[tt]['Multicore'] = set_to
                                    schema[tt]['TimePerEvent'] /= factor
                            else:
                                break
                    else:
                        schema['Multicore'] = options.Multicore
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup']  = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent

                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int,options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] =options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1,ntask-1):
                        schema['Task%d'%it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask-1
                    schema.pop('Task%d'%ntask)

                if options.priority:
                    schema['RequestPriority'] = options.priority

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop shit on the way to reqmgr2
                schema = reqMgrClient.purgeClonedSchema( schema )

                print "submitting"
                if (options.to_stepchain and (schema['RequestType']=='TaskChain')):
                    ## transform the schema into StepChain schema
                    print "Transforming a TaskChain into a StepChain"
                    mcore = 0
                    mem = 0
                    schema['RequestType'] = 'StepChain'
                    schema['StepChain'] = schema.pop('TaskChain')
                    schema['SizePerEvent'] = 0
                    schema['TimePerEvent'] = 0
                    step=1
                    s_n = {}
                    while True:
                        if 'Task%d'%step in schema:
                            sname = 'Step%d'%step
                            schema[sname] = schema.pop('Task%d'%step)
                            tmcore = schema[sname].pop('Multicore')
                            tmem = schema[sname].pop('Memory')
                            if mcore and tmcore != mcore:
                                wfi.sendLog('rejector','the conversion to stepchain encoutered different value of Multicore %d != %d'%( tmcore, mcore))
                                sendLog('rejector','the conversion of %s to stepchain encoutered different value of Multicore %d != %d'%( wfo.name, tmcore, mcore), level='critical')
                            mcore = max(mcore, tmcore)
                            mem = max(mem, tmem)
                            schema[sname]['StepName'] = schema[sname].pop('TaskName')
                            s_n[ schema[sname]['StepName'] ] = sname
                            if 'InputTask' in schema[sname]:
                                schema[sname]['InputStep'] = schema[sname].pop('InputTask')
                            eff = 1.
                            up_s = sname
                            while True:
                                ## climb up a step. supposedely already all converted
                                up_s = s_n.get(schema[up_s].get('InputStep',None),None)
                                if up_s:
                                    ## multiply with the efficiency
                                    eff *= schema[up_s].get('FilterEfficiency',1.)
                                else:
                                    ## or stop there
                                    break

                            if not 'KeepOutput' in schema[sname]:
                                ## this is a weird translation capability. Absence of keepoutput in step means : keep the output. while in TaskChain absence means : drop
                                schema[sname]['KeepOutput'] = False
                            schema['TimePerEvent'] += eff*schema[sname].pop('TimePerEvent')
                            schema['SizePerEvent'] += eff*schema[sname].pop('SizePerEvent')
                            step+=1
                        else:
                            break
                    schema['Multicore'] = mcore
                    schema['Memory'] = mem
                print json.dumps( schema, indent=2 )
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    msg = "Error in cloning {}".format(wfo.name)
                    print(msg)
                    wfi.sendLog('rejector',msg)
                          
                    # Get the error message
                    time.sleep(5)
                    data = reqMgrClient.requestManagerPost(url, "/reqmgr2/data/request", schema)
                    wfi.sendLog('rejector',data)
                    
                    print json.dumps( schema, indent=2 )
                    return 
                print newWorkflow

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfi.sendLog('rejector','Cloned into %s by unified operator %s'%( newWorkflow, comment ))
                wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False)
            else:
                wfo.status = 'trouble' if options.set_trouble else 'forget' 
                wfi.notifyRequestor('Rejected by unified operator %s'%( comment ),do_batch=False)
                session.commit()

        else:
            msg = "Error in rejecting {}: {}".format(wfo.name,results)
            print(msg)
            wfi.sendLog('rejector',msg)
Esempio n. 17
0
def checkor(url, spec=None, options=None):
    fDB = closeoutInfo()

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.fetch:
        ## get all in running and check
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )
    if options.nofetch:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    by_passes = []
    holdings = []
    for bypassor,email in [('jbadillo','*****@*****.**'),('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            print "no file",bypass_file
            continue
        try:
            by_passes.extend( json.loads(open(bypass_file).read()))
        except:
            print "cannot get by-passes from",bypass_file,"for",bypassor
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            print "no file",holding_file
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            print "cannot get holdings from",holding_file,"for",bypassor
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])


    total_running_time = 5.*60. 
    sleep_time = max(0.5, total_running_time / len(wfs))

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        print "checking on",wfo.name
        
        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            print wfo.name,"is already",wfo.wm_status
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            print wfo.name,"is in trouble",wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            print wfo.name,"not running yet"
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings:
                print wfo.name,"on hold"
                continue
        if wfo.name in holdings:
            wfo.status = 'assistance-onhold'
            print "setting",wfo.name,"on hold"
            session.commit()
            continue
        
        if wfo.wm_status != 'completed':
            ## for sure move on with closeout check if in completed
            print "no need to check on",wfo.name,"in status",wfo.wm_status
            session.commit()
            continue

        session.commit()        
        sub_assistance="" # if that string is filled, there will be need for manual assistance

        is_closing = True
        ## do the closed-out checks one by one

        ## get it from somewhere
        by_pass_checks = False
        if wfo.name in by_passes:
            print "we can bypass checks on",wfo.name
            by_pass_checks = True
        for bypass in by_passes:
            if bypass in wfo.name:
                print "we can bypass",wfo.name,"because of keyword",bypass
                by_pass_checks = True
                break

        # tuck out DQMIO/DQM
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        has_recovery_going=False
        had_any_recovery = False
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestStatus'] in ['running-open','running-closed','assignment-approved','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                #print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False
                has_recovery_going=True
            elif member['RequestStatus']==None:
                print member['RequestName'],"is not real"
                pass
            else:
                acdc_inactive.append( member['RequestName'] )
                had_any_recovery = True
        ## completion check
        percent_completions = {}
#        print "let's see who is crashing", wfo.name
#        print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        fractions_pass = {}
        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                print "overriding fraction to",fractions_pass[output],"for",output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if has_recovery_going:
                sub_assistance+='-recovering'
            elif had_any_recovery:
                ## we want to have this looked at
                sub_assistance+='-manual'
            else:
                sub_assistance+='-recovery'
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = get_campaign(output, wfi)
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
        
        if any([ events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance+='-biglumi'
            is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        vetoed_custodial_tier = ['MINIAODSIM']
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"
                        break
            ## get from the parent
            pick_custodial = True
            if not custodial and 'InputDataset' in wfi.request:
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( wfi.request['InputDataset'])
                ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort"
                    sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%wfi.request['InputDataset'])
                    is_closing = False
                    pick_custodial = False

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE()

            if custodial and ((not sub_assistance and not acdc) or by_pass_checks):
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            else:
                print "cannot find a custodial for",wfo.name
            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance+="-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output )
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output )
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                sub_assistance+='-duplicates'
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))

        if by_pass_checks:
            ## force closing
            is_closing = True

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                print "close out answer",res
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            new_status = 'assistance'+sub_assistance
            print wfo.name,"needs assistance with",new_status
            
            if sub_assistance and wfo.status != new_status and 'PrepID' in wfi.request and not 'manual' in wfo.status:
                pid = wfi.getPrepIDs()[0].replace('task_','')
                #pid = wfi.request['PrepID'].replace('task_','')
                ## notify
                messages= {
                    'recovery' : 'Samples completed with missing lumi count:\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ) ),
                    'biglumi' : 'Samples completed with large luminosity blocks:\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])])),
                    'duplicate' : 'Samples completed with duplicated luminosity blocks:\n%s'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    }
                text ="The request %s (%s) is facing issue in production.\n" %( pid, wfo.name )
                content = ""
                for case in messages:
                    if case in new_status:
                        content+= "\n"+messages[case]+"\n"
                text += content
                text += "You are invited to check, while this is being taken care of by Ops.\n"
                text += "This is an automated message."
                if use_mcm and content:
                    print "Sending notification back to requestor"
                    print text
                    batches = mcm.getA('batches',query='contains=%s&status=announced'%pid)
                    if len(batches):
                        ## go notify the batch
                        bid = batches[-1]['prepid']
                        print "batch nofication to",bid
                        mcm.put('/restapi/batches/notify', { "notes" : text, "prepid" : bid})


                    ## go notify the request
                    print "request notification to",pid
                    mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid] })

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"


    fDB.html()

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Esempio n. 18
0
def actor(url, options=None):

    if moduleLock(wait=False, silent=True)(): return
    if userLock('actor'): return

    up = componentInfo(soft=['mcm'])
    if not up.check(): return

    # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    WC = wtcClient()

    action_list = WC.get_actions()
    if action_list is None:
        print "Not able to load action list"
        sendLog('actor', 'Not able to load action list', level='critical')
        return

    if options.actions:
        action_list = json.loads(open(options.actions).read())

    print json.dumps(action_list, indent=2)
    if not action_list:
        print "EMPTY!"
        return

    wf_list = action_list.keys()
    print json.dumps(sorted(wf_list), indent=2)
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print '-' * 100
        print "Looking at", wfname, "for recovery options"

        to_clone = False
        to_acdc = False
        to_force = False
        to_hold = False
        something_to_do = False
        tasks = action_list[wfname].get('Parameters', None)
        to_acdc = action_list[wfname].get('Action', None) == 'acdc'
        to_clone = action_list[wfname].get('Action', None) == 'clone'
        to_force = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['by-pass', 'bypass']
        to_hold = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['onhold', 'on-hold']

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog(
                'actor',
                'Action submitted for something other than acdc, clone, bypass or hold for workflow %s'
                % wfname,
                level='critical')
            print json.dumps(action_list[wfname], indent=2)
            continue

        if not tasks and to_acdc:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

        #===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            results = []
            datasets = set(wfi.request['OutputDatasets'])

            comment = ""

            if 'comment' in tasks: comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            #Reject all workflows in the family
            #first reject the original workflow.
            reqMgrClient.invalidateWorkflow(
                url,
                wfi.request['RequestName'],
                current_status=wfi.request['RequestStatus'],
                cascade=False)
            #Then reject any ACDCs associated with that workflow
            family = getWorkflowById(url, wfi.request['PrepID'], details=True)
            for fwl in family:
                print "rejecting", fwl['RequestName'], fwl['RequestStatus']
                wfi.sendLog(
                    'actor', "rejecting %s, previous status %s" %
                    (fwl['RequestName'], fwl['RequestStatus']))
                reqMgrClient.invalidateWorkflow(
                    url,
                    fwl['RequestName'],
                    current_status=fwl['RequestStatus'],
                    cascade=False)
                datasets.update(fwl['OutputDatasets'])
            #Invalidate all associated output datasets
            for dataset in datasets:
                results.append(setDatasetStatus(dataset, 'INVALID'))

            if all(map(lambda result: result in ['None', None, True],
                       results)):
                wfi.sendLog('actor', "%s and children are rejected" % wfname)

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print str(e)
                ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')

            else:
                wfi.sendLog('actor', "Workflow %s cloned" % wfname)

#===========================================================
        elif to_force:
            wfi.sendLog('actor',
                        'Bypassing from workflow traffic controler request')
            forcing = json.loads(
                open(
                    '/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json'
                ).read())
            forcing.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json',
                 'w').write(json.dumps(sorted(set(forcing))))
        elif to_hold:
            wfi.sendLog('actor',
                        'Holding on workflow traffic controler request')
            holding = json.loads(
                open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json').
                read())
            holding.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json',
                 'w').write(json.dumps(sorted(set(holding))))
#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        if setting in tasks[task]:
                            tasks[task][setting] = allTasksDefaults[setting]
                        else:
                            tasks[task].append(
                                {setting: allTasksDefaults[setting]})
            print "Tasks is "
            print json.dumps(tasks, indent=2)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print "FYI getWMErrors is blank. Presumably there are only unreported errors"
#                continue

            try:
                where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo(
                )
                print "Where to run = "
                print where_to_run
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because site list cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""

            num_tasks_to_recover = 0

            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task: continue
                    if 'Cleanup' in task: continue
                    if not 'jobfailed' in WMErr[task]:
                        continue
                    else:
                        num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for", wfname
#            recover = False

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)

#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
#   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
                #                print "Full task name is " + fulltaskname
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                set([
                                    SI.SE_to_CE(site)
                                    for site in actions[action]
                                ]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(
                        set([SI.SE_to_CE(site)
                             for site in where_to_run[task]]))
                    print "Found", sorted(
                        assign_to_sites
                    ), "as sites where to run the ACDC at, from the acdc doc of ", wfname
                print "Going to run at", sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url,
                                          fulltaskname,
                                          wfi.request,
                                          actions,
                                          do=options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)),
                                    level='critical')
                                wfi.sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
                            #                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else:  #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog(
                            'actor',
                            'ACDC created for task %s. Actions taken \n%s' %
                            (fulltaskname, json.dumps(actions)))
                        #team = wfi.request['Teams'][0]
                        team = 'production'
                        parameters = {
                            'SiteWhitelist': sorted(assign_to_sites),
                            'AcquisitionEra': wfi.acquisitionEra(),
                            'ProcessingString': wfi.processingString(),
                            'MergedLFNBase': wfi.request['MergedLFNBase'],
                            'ProcessingVersion':
                            wfi.request['ProcessingVersion'],
                        }
                        ## hackery for ACDC merge assignment
                        if wfi.request[
                                'RequestType'] == 'TaskChain' and 'Merge' in task.split(
                                    '/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request
                              and wfi.request['TrustSitelists'] == 'true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'secondary' in actions:
                            if actions['secondary'] == 'enabled':
                                print 'Enabling reading the secondary input via xrootd'
                                parameters['TrustPUSitelists'] = True
                            elif actions['secondary'] == 'disabled':
                                parameters['TrustPUSitelists'] = False
                            #in case secondary is blank or not set to enabled or disabled
                            elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                    'TrustPUSitelists']:
                                parameters['TrustPUSitelists'] = True
                        elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                'TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC", acdc
                            parameters['execute'] = True
                            wfi.sendLog('actor',
                                        "%s  was assigned for recovery" % acdc)
                        else:
                            print "no assignment done with this ACDC", acdc
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                            continue

#                       print parameters
                        result = reqMgrClient.assignWorkflow(
                            url, acdc, team, parameters)
                        if not result:
                            print acdc, "was not assigned"
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                        else:
                            recovering.add(acdc)
                        wfi.sendLog('actor', "ACDCs created for %s" % wfname)
        #===========================================================

        if recover and options.do:
            r = WC.remove_action(wfname)
            if not r:
                sendLog(
                    'actor',
                    'not able to remove the action, interlocking the module',
                    level='critical')
                os.system('touch %s/actor.failed-%s.lock' %
                          (base_eos_dir, os.getpid()))
                sys.exit(-1)

        if message_to_user:
            print wfname, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')

    return
Esempio n. 19
0
import pickle
import sys

url = 'cmsweb.cern.ch'


pid = 'ReReco-Run2015D-16Dec2015-0009'
if len(sys.argv) > 1:
    pid = sys.argv[1]

fetch=False
if len(sys.argv)>2:
    fetch = bool(int(sys.argv[2]))
    print "setting fetch to",fetch

wfs = getWorkflowById( url, pid , details=True)
if not wfs:
    print "no workflow for",pid
    sys.exit(-1)

in_dataset = None
input_json = defaultdict(list)
input_rl = []
output_json = {}
output_rl = defaultdict(list)
missing_rl = defaultdict(list)
errors_by_lb = defaultdict(lambda : defaultdict(set))
dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
ecode_ban = []#99999,139,134,92]

## try to get more workflows by their outputs
Esempio n. 20
0
def injector(url, options, specific):

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=["mcm"])
    if not up.check():
        return
    use_mcm = up.status["mcm"]

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting", wf
            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(1)

    existing = [wf.name for wf in session.query(Workflow).all()]

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == "trouble").all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl["PrepID"])
        true_familly = []
        for member in familly:
            if member == wf.name:
                continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace:
                    continue
            else:
                if fwl["RequestDate"] < wl["RequestDate"]:
                    continue
                if fwl["RequestType"] == "Resubmission":
                    continue
                if fwl["RequestStatus"] in ["None", None]:
                    continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            print wf.name, "ERROR has no replacement"
            known = []
            try:
                known = json.loads(open("no_replacement.json").read())
            except:
                pass
            if not wf.name in known:
                sendEmail(
                    "workflow in %s with no replacement" % (wl["RequestStatus"]), "%s is dangling there" % (wf.name)
                )
                known.append(wf.name)
                open("no_replacement.json", "w").write(json.dumps(known, indent=2))
            continue
        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"

        for fwl in true_familly:
            member = fwl["RequestName"]
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                print "putting", member, "as replacement of", wf.name
                status = "away"
                if fwl["RequestStatus"] in ["assignment-approved"]:
                    status = "considered"
                new_wf = Workflow(name=member, status=status, wm_status=fwl["RequestStatus"])
                wf.status = "forget"
                session.add(new_wf)
            else:
                if new_wf.status == "forget":
                    continue
                print "getting", new_wf.name, "as replacement of", wf.name
                wf.status = "forget"

            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove(wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid, "got", new_wf.name
                    if new_wf.status != "away":
                        print "\t setting it considered"
                        new_wf.status = "considered"
                    session.commit()

        ## don't do that automatically
        # wf.status = 'forget'
        session.commit()
Esempio n. 21
0
def rejector(url, specific, options=None):

    up = componentInfo(soft=['wtc', 'jira'])
    #if not up.check(): return

    if specific and specific.startswith('/'):
        ## this is for a dataset
        print setDatasetStatus(specific, 'INVALID')
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend(
                session.query(Workflow).filter(
                    Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
        if not wfs:
            batches = batchInfo().content()
            for bname in batches:
                if specific == bname:
                    for pid in batches[bname]:
                        b_wfs = getWorkflowById(url, pid)
                        for wf in b_wfs:
                            wfs.append(
                                session.query(Workflow).filter(
                                    Workflow.name == wf).first())
                    break
    else:
        wfs = session.query(Workflow).filter(
            Workflow.status == 'assistance-clone').all()
        #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all())
        ## be careful then on clone case by case
        options.clone = True
        print "not supposed to function yet"
        return

    print len(wfs), "to reject"

    if len(wfs) > 1:
        print "\n".join([wfo.name for wfo in wfs])
        answer = raw_input('Reject these')
        if not answer.lower() in ['y', 'yes']:
            return

    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", spec
            return
        wfi = workflowInfo(url, wfo.name)

        comment = ""
        if options.comments: comment = ", reason: " + options.comments
        if options.keep:
            wfi.sendLog(
                'rejector',
                'invalidating the workflow by unified operator%s' % comment)
        else:
            wfi.sendLog(
                'rejector',
                'invalidating the workflow and outputs by unified operator%s' %
                comment)

        results = invalidate(url,
                             wfi,
                             only_resub=True,
                             with_output=(not options.keep))

        if all(results):
            print wfo.name, "rejected"
            if options and options.clone:
                wfo.status = 'trouble'
                session.commit()
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(
                        schema['ProcessingVersion']
                    ) + 1  ## dubious str->int conversion
                else:
                    schema['ProcessingVersion'] = 2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tampering of the original request
                if options.Memory:
                    if schema['RequestType'] == 'TaskChain':
                        it = 1
                        while True:
                            t = 'Task%d' % it
                            it += 1
                            if t in schema:
                                schema[t]['Memory'] = options.Memory
                            else:
                                break
                    else:
                        schema['Memory'] = options.Memory

                if options.short_task and schema['RequestType'] == 'TaskChain':
                    translate = {}
                    it = 1
                    while True:
                        tt = 'Task%d' % it
                        if tt in schema:
                            tname = schema[tt]['TaskName']
                            ntname = 'T%d' % it
                            translate[tname] = ntname
                            it += 1
                            schema[tt]['TaskName'] = ntname
                            if 'InputTask' in schema[tt]:
                                itname = schema[tt]['InputTask']
                                schema[tt]['InputTask'] = translate[itname]
                        else:
                            break
                    for k in schema.get('ProcessingString', {}).keys():
                        schema['ProcessingString'][
                            translate[k]] = schema['ProcessingString'].pop(k)
                    for k in schema.get('AcquisitionEra', {}).keys():
                        schema['AcquisitionEra'][
                            translate[k]] = schema['AcquisitionEra'].pop(k)

                if options.Multicore:
                    ## to do : set it properly in taskchains
                    if schema['RequestType'] == 'TaskChain':
                        tasks, set_to = options.Multicore.split(
                            ':') if ':' in options.Multicore else (
                                "", options.Multicore)
                        set_to = int(set_to)
                        tasks = tasks.split(',') if tasks else ['Task1']
                        it = 1
                        while True:
                            tt = 'Task%d' % it
                            it += 1
                            if tt in schema:
                                tname = schema[tt]['TaskName']
                                if tname in tasks or tt in tasks:
                                    mem = schema[tt]['Memory']
                                    mcore = schema[tt].get('Multicore', 1)
                                    factor = (set_to / float(mcore))
                                    fraction_constant = 0.4
                                    mem_per_core_c = int(
                                        (1 - fraction_constant) * mem /
                                        float(mcore))
                                    print "mem per core", mem_per_core_c
                                    print "base mem", mem
                                    ## adjusting the parameter in the clone
                                    schema[tt]['Memory'] += (
                                        set_to - mcore) * mem_per_core_c
                                    schema[tt]['Multicore'] = set_to
                                    schema[tt]['TimePerEvent'] /= factor
                            else:
                                break
                    else:
                        schema['Multicore'] = options.Multicore
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent

                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int, options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1, ntask - 1):
                        schema['Task%d' % it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask - 1
                    schema.pop('Task%d' % ntask)

                if options.priority:
                    schema['RequestPriority'] = options.priority

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop shit on the way to reqmgr2
                schema = reqMgrClient.purgeClonedSchema(schema)

                print "submitting"
                if (options.to_stepchain
                        and (schema['RequestType'] == 'TaskChain')):
                    ## transform the schema into StepChain schema
                    print "Transforming a TaskChain into a StepChain"
                    mcore = 0
                    mem = 0
                    schema['RequestType'] = 'StepChain'
                    schema['StepChain'] = schema.pop('TaskChain')
                    schema['SizePerEvent'] = 0
                    schema['TimePerEvent'] = 0
                    step = 1
                    s_n = {}
                    while True:
                        if 'Task%d' % step in schema:
                            sname = 'Step%d' % step
                            schema[sname] = schema.pop('Task%d' % step)
                            tmcore = schema[sname].pop('Multicore')
                            tmem = schema[sname].pop('Memory')
                            if mcore and tmcore != mcore:
                                wfi.sendLog(
                                    'rejector',
                                    'the conversion to stepchain encoutered different value of Multicore %d != %d'
                                    % (tmcore, mcore))
                                sendLog(
                                    'rejector',
                                    'the conversion of %s to stepchain encoutered different value of Multicore %d != %d'
                                    % (wfo.name, tmcore, mcore),
                                    level='critical')
                            mcore = max(mcore, tmcore)
                            mem = max(mem, tmem)
                            schema[sname]['StepName'] = schema[sname].pop(
                                'TaskName')
                            s_n[schema[sname]['StepName']] = sname
                            if 'InputTask' in schema[sname]:
                                schema[sname]['InputStep'] = schema[sname].pop(
                                    'InputTask')
                            eff = 1.
                            up_s = sname
                            while True:
                                ## climb up a step. supposedely already all converted
                                up_s = s_n.get(
                                    schema[up_s].get('InputStep', None), None)
                                if up_s:
                                    ## multiply with the efficiency
                                    eff *= schema[up_s].get(
                                        'FilterEfficiency', 1.)
                                else:
                                    ## or stop there
                                    break

                            if not 'KeepOutput' in schema[sname]:
                                ## this is a weird translation capability. Absence of keepoutput in step means : keep the output. while in TaskChain absence means : drop
                                schema[sname]['KeepOutput'] = False
                            schema['TimePerEvent'] += eff * schema[sname].pop(
                                'TimePerEvent')
                            schema['SizePerEvent'] += eff * schema[sname].pop(
                                'SizePerEvent')
                            step += 1
                        else:
                            break
                    schema['Multicore'] = mcore
                    schema['Memory'] = mem
                print json.dumps(schema, indent=2)
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    msg = "Error in cloning {}".format(wfo.name)
                    print(msg)
                    wfi.sendLog('rejector', msg)

                    # Get the error message
                    time.sleep(5)
                    data = reqMgrClient.requestManagerPost(
                        url, "/reqmgr2/data/request", schema)
                    wfi.sendLog('rejector', data)

                    print json.dumps(schema, indent=2)
                    return
                print newWorkflow

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfi.sendLog(
                    'rejector', 'Cloned into %s by unified operator %s' %
                    (newWorkflow, comment))
                #wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False)
            else:
                wfo.status = 'trouble' if options.set_trouble else 'forget'
                wfi.notifyRequestor('Rejected by unified operator %s' %
                                    (comment),
                                    do_batch=False)
                session.commit()

        else:
            msg = "Error in rejecting {}: {}".format(wfo.name, results)
            print(msg)
            wfi.sendLog('rejector', msg)
Esempio n. 22
0
from collections import defaultdict
import pickle
import sys

url = 'cmsweb.cern.ch'

pid = 'ReReco-Run2015D-16Dec2015-0009'
if len(sys.argv) > 1:
    pid = sys.argv[1]

fetch = False
if len(sys.argv) > 2:
    fetch = bool(int(sys.argv[2]))
    print "setting fetch to", fetch

wfs = getWorkflowById(url, pid, details=True)
if not wfs:
    print "no workflow for", pid
    sys.exit(-1)

in_dataset = None
input_json = defaultdict(list)
input_rl = []
output_json = {}
output_rl = defaultdict(list)
missing_rl = defaultdict(list)
errors_by_lb = defaultdict(lambda: defaultdict(set))
dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
ecode_ban = []  #99999,139,134,92]

## try to get more workflows by their outputs
Esempio n. 23
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock() and not options.go:  return

    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    def time_point(label="",sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s"%(label, nows)
        print "Since start: %s [s]"% ( now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) 
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]"% ( now - time_point.lap ) 
            time_point.lap = now            
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())
    
    runnings = session.query(Workflow).filter(Workflow.status == 'away').all()
    standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()

    ## intersect with what is actually in completed status in request manager now
    all_completed = set(getWorkflows(url, 'completed' ))

    wfs=[]

    if options.strict:
        ## the one which were running and now have completed
        print "strict option is on: checking workflows that freshly completed"
        wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings))
    if options.update:
        print "update option is on: checking workflows that have not completed yet"
        wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings))

    if options.clear:
        print "clear option is on: checking workflows that are ready to toggle closed-out"
        wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings))
    if options.review:
        print "review option is on: checking the workflows that needed intervention"
        wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings))

    ## what is left out are the wf which were running and ended up aborted/failed/...

    

    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False) if use_mcm else None

    def get_campaign(output, wfi):
        ## this should be a perfect matching of output->task->campaign
        campaign = None
        era = None
        wf_campaign = None
        if 'Campaign' in wfi.request:   wf_campaign = wfi.request['Campaign']
        try:
            era = output.split('/')[2].split('-')[0]
        except:
            era = None
            
        if wfi.isRelval(): 
            campaign = wf_campaign
        else:
            campaign = era if era else wf_campaign
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    
    actors = UC.get('allowed_bypass')

    for bypassor,email in actors:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            extending = json.loads(open(holding_file).read())
            print bypassor,"is holding",extending
            holdings.extend( extending )
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in actors:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        #if forcings:
        #    sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    in_manual = 0

    ## now you have a record of what file was invalidated globally from TT
    TMDB_invalid = dataCache.get('file_invalidation') 
    #try:
    #    TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))])
    #    TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid)
    #    print len(TMDB_invalid),"globally invalidated files"
    #except Exception as e:
    #    print "TMDB not fetched"
    #    print str(e)
    #    TMDB_invalid = []


    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if options.limit: max_per_round=options.limit
    if max_per_round and not spec: wfs = wfs[:max_per_round]



    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        
        time.sleep( sleep_time )
        
        time_point("Starting with %s"% wfo.name)

        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM'))
        campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c 
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        true_familly = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['PrepID'] != wfi.request['PrepID'] : continue
            #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue

            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical')
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue

            true_familly.append( member['RequestName'] )
            #try:
            #    parse_one(url, member['RequestName'])
            #except:
            #    print "Could not make error report for",member['RequestName']

            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))
            sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical')

        time_point("checked workflow familly", sub_lap=True)


        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong ibsolute
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        events_per_lumi = {}

        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        time_point("execpted statistics", sub_lap=True)

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100
                
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            default_pass = UC.get('default_fraction_pass')
            fractions_pass[output] = default_pass
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                if type(CI.campaigns[c]['fractionpass']) == dict:
                    tier = output.split('/')[-1]
                    priority = str(wfi.request['RequestPriority'])
                    ## defined per tier
                    fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass)
                    if tier in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier]
                    if priority in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority]
                else:
                    fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical')
                #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**'])
                ## do not bypass for now, until Alan understands why we are loosing ACDC docs 
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        time_point("checked output size", sub_lap=True)

        ## correct lumi < 300 event per lumi
        #for output in wfi.request['OutputDatasets']:
        #events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        time_point("checked dataset presence", sub_lap=True)

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        time_point("checked custodiality", sub_lap=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        time_point("checked phedex count", sub_lap=True)


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            group = None
            if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]:
                group = CI.campaigns[campaign]['phedex_group']
                print "using group",group,"for replica"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit")
                
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)


            if custodial and size_worht_going_to_ddm > tape_size_limit:
                print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit))
                custodial = None

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"

                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output))
                            custodials[custodial].append( output )
                            if group: custodials[custodial][-1]+='@%s'%group
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        time_point("determined tape location", sub_lap=True)

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        
        time_point("dbs file count", sub_lap=True)

        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n"
            mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n"
            mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n"
            mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n"

            wfi.sendLog('checkor',mismatch_notice)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                                                                          "\n".join( missing_phedex )))
                        were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
                            sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated),
                                                                                                                          "\n".join(were_invalidated)), level='critical')
                            dbs3Client.setFileStatus( were_invalidated, newstatus=0 )
                                
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))
                        were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False
        
        time_point("checked file count", sub_lap=True)

        fraction_invalid = 0.20
        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        files_per_rl = {}
        for output in wfi.request['OutputDatasets']:
            duplications[output] = "skiped"
            files_per_rl[output] = "skiped"

        time_point("checked invalidation", sub_lap=True)

        if (is_closing or bypass_checks) and (not options.ignoreduplicates):
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                    except Exception as e:
                        wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output))
                        sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical')
                        is_closing=False

            if is_closing and any(duplications.values()) and not options.ignoreduplicates:
                duplicate_notice = ""
                duplicate_notice += "%s has duplicates\n"%wfo.name
                duplicate_notice += json.dumps( duplications,indent=2)
                duplicate_notice += '\n'
                duplicate_notice += json.dumps( files_per_rl, indent=2)
                wfi.sendLog('checkor',duplicate_notice)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 


        time_point("checked duplicates", sub_lap=True)

        time_point("done with %s"%wfo.name)

        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            #rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            rec['familly'] = true_familly
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## make the lumi summary 
        if wfi.request['RequestType'] == 'ReReco':
            try:
                os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID']))
                os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID']))
                wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID']))
            except Exception as e:
                print str(e)
        ## make the error report
        
    
        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            wfi.sendLog('checkor',"setting %s closed-out"% wfo.name)
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            if not 'custodial' in assistance_tags or wfi.isRelval():
                ## do only the report for those
                for member in acdc+acdc_inactive+[wfo.name]:
                    try:
                        parse_one(url, member)
                    except:
                        print "Could not make error report for",member

            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that had ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')
                in_manual += 1
            if 'recovery' in assistance_tags and 'manual' in assistance_tags:
                ## this is likely because something bad is happening, so leave it to manual
                assistance_tags = assistance_tags - set(['recovery'])
                assistance_tags.add('manual')
                in_manual += 1

            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name)
                ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                perflink = '%s/report/%s'%(unified_url,wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status))
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec and in_manual!=0:
        sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        items_at = defaultdict(set)
        for i in custodials[site]:
            item, group = i.split('@') if '@' in i else (i,'DataOps')
            items_at[group].add( item )
        for group,items in items_at.items():
            print ','.join(items),'=>',site,'@',group
            if not options.test:
                result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group)
                print result

    print "File Invalidation"
    print invalidations
Esempio n. 24
0
def completor(url, specific):

    CI = campaignInfo()

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all())
    ##wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order
    random.shuffle(wfs)

    ## by workflow a list of fraction / timestamps
    completions = json.loads(open("completions.json").read())

    good_fractions = {}
    for c in CI.campaigns:
        if "force-complete" in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]["force-complete"]

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    for wfo in wfs:
        if specific and not specific in wfo.name:
            continue

        if not any([c in wfo.name for c in good_fractions]):
            continue

        print "looking at", wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)

        if not "Campaign" in wfi.request:
            continue
        c = wfi.request["Campaign"]
        if not c in good_fractions:
            continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.0

        lumi_expected = None
        event_expected = None
        if not "TotalInputEvents" in wfi.request:
            if "RequestNumEvents" in wfi.request:
                event_expected = wfi.request["RequestNumEvents"]
            else:
                continue
        else:
            lumi_expected = wfi.request["TotalInputLumis"]
            event_expected = wfi.request["TotalInputEvents"]

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.0)

        running_log = filter(
            lambda change: change["Status"] in ["running-open", "running-closed"], wfi.request["RequestTransition"]
        )
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]["UpdateTime"] / (60.0 * 60.0 * 24.0)
        delay = now - then  ## in days

        (w, d) = divmod(delay, 7)
        print "\t" * int(w) + "Running since", delay, "[days]"
        if delay <= 4:
            continue
        if delay >= 7:
            sendEmail("long lasting workflow", "%s has been running for %s days" % (wfo.name, delay))

        percent_completions = {}
        for output in wfi.request["OutputDatasets"]:
            if not output in completions:
                completions[output] = {"injected": None, "checkpoints": [], "workflow": wfo.name}
            ## get completion fraction
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion = 0.0
            event_completion = 0.0
            if lumi_expected:
                lumi_completion = lumi_count / float(lumi_expected)
            if event_expected:
                event_completion = event_count / float(event_expected)

            # take the less optimistic
            percent_completions[output] = min(lumi_completion, event_completion)
            completions[output]["checkpoints"].append((now, event_completion))

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            print "all is above", good_fraction, "for", wfo.name
            print json.dumps(percent_completions, indent=2)
        else:
            print "\t", percent_completions.values(), "not over bound", good_fraction
            # print json.dumps( percent_completions, indent=2 )
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in percent_completions:
            completions[output]["injected"] = then

        # further check on delays
        cpuh = wfi.getComputingTime(unit="d")

        ran_at = wfi.request["SiteWhitelist"]
        print "Required:", cpuh,
        print "Time spend:", delay

        ## find ACDCs that might be running
        familly = getWorkflowById(url, wfi.request["PrepID"], details=True)
        for member in familly:
            ### if member['RequestName'] == wl['RequestName']: continue ## set himself out
            if member["RequestDate"] < wfi.request["RequestDate"]:
                continue
            if member["RequestStatus"] in ["None", None]:
                continue
            ## then set force complete all members
            if member["RequestStatus"] in ["running-opened", "running-closed"]:
                print "setting", member["RequestName"], "force-complete"
                print "NOT REALLY FORCING"
                sendEmail(
                    "force completing",
                    "TAGGING %s is worth force completing\n%s" % (member["RequestName"], percent_completions),
                )
                ##reqMgrClient.setWorkflowForceComplete(url, member['RequestName'])

        ## do it once only for testing
        # break

    open("completions.json", "w").write(json.dumps(completions, indent=2))
Esempio n. 25
0
def injector(url, options, specific):

    ## passing a round of invalidation of what needs to be invalidated
    if options.invalidate:
        invalidator(url)

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting", wf
            new_wf = Workflow(name=wf,
                              status=options.setstatus,
                              wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()

    existing = [wf.name for wf in session.query(Workflow).all()]

    ## pick up replacements
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        if len(familly) == 1:
            print wf.name, "ERROR has no replacement"
            continue
        print wf.name, "has", len(familly), "familly members"
        for member in familly:
            if member != wf.name:
                fwl = getWorkLoad(url, member)
                if options.replace:
                    if member != options.replace: continue
                else:
                    if fwl['RequestDate'] < wl['RequestDate']: continue
                    if fwl['RequestType'] == 'Resubmission': continue
                    if fwl['RequestStatus'] in ['None', None]: continue

                new_wf = session.query(Workflow).filter(
                    Workflow.name == member).first()
                if not new_wf:
                    print "putting", member
                    status = 'away'
                    if fwl['RequestStatus'] in ['assignment-approved']:
                        status = 'considered'
                    new_wf = Workflow(name=member,
                                      status=status,
                                      wm_status=fwl['RequestStatus'])
                    wf.status = 'forget'
                    session.add(new_wf)
                    session.commit()
                else:
                    if new_wf.status == 'forget': continue
                    print "getting", new_wf.name, "as replacement of", wf.name

                for tr in session.query(Transfer).all():
                    if wf.id in tr.workflows_id:
                        sw = copy.deepcopy(tr.workflows_id)
                        sw.remove(wf.id)
                        sw.append(new_wf.id)
                        tr.workflows_id = sw
                        print tr.phedexid, "got", new_wf.name
                        if new_wf.status != 'away':
                            new_wf.status = 'staging'
                        session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
Esempio n. 26
0
def injector(url, options, specific):

    use_mcm = True
    up = componentInfo( mcm = use_mcm, soft=['mcm'] )
    if not up.check(): return
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco")) ## regardless of users, pick up all ReReco on the table

    print len(workflows),"in line"
    cannot_inject = set()
    status_cache = defaultdict(str)
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf ).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            #wl = getWorkLoad(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            #            print wfi.request
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                #req_familly = getWorkflowById( url, wl['PrepID'])
                #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly]
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend( getWorkflowById( url, pid, details=True) )
                    
                familly = []
                print len(req_familly),"members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend( session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all() )

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in ['forget','trouble','forget-unlock','forget-out-unlock']:
                        sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status ))
                        print "Should not put",wf,"because of",lwfo.name,lwfo.status
                        cannot_inject.add( wf )
                        can_add = False
            ## add a check on validity of input datasets
            _,prim,par,sec = wfi.getIO()
            for d in list(prim)+list(par)+list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector',"One of the input is not VALID. %s : %s"%( d, status_cache[d]))
                    sendLog('injector',"One of the input of %s is not VALID. %s : %s"%( wf, d, status_cache[d]))
                    can_add = False
            if not can_add: continue
            wfi.sendLog('injector',"considering %s"%wf)

            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) 
            session.add( new_wf )
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass
    
    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog('injector','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)), level='warning')

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name )
        wl = wfi.request #getWorkLoad(url, wf.name)
        familly = getWorkflowById( url, wl['PrepID'] )
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url , member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType']=='Resubmission': continue
                if fwl['RequestStatus'] in ['None',None,'new']: continue
                if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue
            true_familly.append( fwl )

        if len(true_familly)==0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            wfi.sendLog('injector','the workflow was found in trouble with no replacement')
            no_replacement.add( wf.name )
            continue
        else:
            wfi.sendLog('injector','the workflow was found in trouble and has a replacement')
                    
        print wf.name,"has",len(familly),"familly members"
        print wf.name,"has",len(true_familly),"true familly members"

        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly)>1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector','Multiple wf in line, will take the last one for %s \n%s'%( wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector',"putting %s as replacement of %s"%( member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                wf.status = 'forget'
                session.add( new_wf ) 
            else:
                if new_wf.status == 'forget': continue
                sendLog('injector',"getting %s as replacement of %s"%( new_wf.name, wf.name ))
                wf.status = 'forget'

            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove( wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid,"got",new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid<0: ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()
                        

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector','workflow with no replacement, %s \n are dangling there'% ( '\n'.join(no_replacement)), level='critical')
Esempio n. 27
0
def checkor(url, spec=None, options=None):
    fDB = closeoutInfo()
    if userLock():
        return
    if duplicateLock():
        return

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=["mcm"])
    if not up.check():
        return
    use_mcm = up.status["mcm"]

    wfs = []
    if options.fetch:
        ## get all in running and check
        wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all())
        wfs.extend(session.query(Workflow).filter(Workflow.status == "assistance").all())
    if options.nofetch:
        ## than get all in need for assistance
        wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all())

    custodials = defaultdict(list)  # sites : dataset list
    transfers = defaultdict(list)  # sites : dataset list
    invalidations = []  # a list of files
    SI = global_SI
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split("/")[2].split("-")[0]
        except:
            if "Campaign" in wfi.request:
                campaign = wfi.request["Campaign"]
        return campaign

    by_passes = []
    holdings = []
    for bypassor, email in [
        ("jbadillo", "*****@*****.**"),
        ("vlimant", "*****@*****.**"),
        ("jen_a", "*****@*****.**"),
    ]:
        bypass_file = "/afs/cern.ch/user/%s/%s/public/ops/bypass.json" % (bypassor[0], bypassor)
        if not os.path.isfile(bypass_file):
            print "no file", bypass_file
            continue
        try:
            by_passes.extend(json.loads(open(bypass_file).read()))
        except:
            print "cannot get by-passes from", bypass_file, "for", bypassor
            sendEmail("malformated by-pass information", "%s is not json readable" % (bypass_file), destination=[email])

        holding_file = "/afs/cern.ch/user/%s/%s/public/ops/onhold.json" % (bypassor[0], bypassor)
        if not os.path.isfile(holding_file):
            print "no file", holding_file
            continue
        try:
            holdings.extend(json.loads(open(holding_file).read()))
        except:
            print "cannot get holdings from", holding_file, "for", bypassor
            sendEmail(
                "malformated by-pass information", "%s is not json readable" % (holding_file), destination=[email]
            )

    total_running_time = 5.0 * 60.0
    sleep_time = max(0.5, total_running_time / len(wfs))

    for wfo in wfs:
        if spec and not (spec in wfo.name):
            continue
        time.sleep(sleep_time)
        print "checking on", wfo.name

        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request["RequestStatus"]
        if wfo.wm_status == "closed-out":
            ## manually closed-out
            print wfo.name, "is already", wfo.wm_status
            wfo.status = "close"
            session.commit()
            continue

        elif wfo.wm_status in [
            "failed",
            "aborted",
            "aborted-archived",
            "rejected",
            "rejected-archived",
            "aborted-completed",
        ]:
            ## went into trouble
            wfo.status = "trouble"
            print wfo.name, "is in trouble", wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ["assigned", "acquired"]:
            ## not worth checking yet
            print wfo.name, "not running yet"
            session.commit()
            continue

        if "-onhold" in wfo.status:
            if wfo.name in holdings and wfo.name not in by_passes:
                print wfo.name, "on hold"
                continue

        if wfo.name in holdings and wfo.name not in by_passes:
            wfo.status = "assistance-onhold"
            print "setting", wfo.name, "on hold"
            session.commit()
            continue

        if wfo.wm_status != "completed" and not wfo.name in by_passes:
            ## for sure move on with closeout check if in completed
            print "no need to check on", wfo.name, "in status", wfo.wm_status
            session.commit()
            continue

        session.commit()
        sub_assistance = ""  # if that string is filled, there will be need for manual assistance

        is_closing = True

        ## get it from somewhere
        by_pass_checks = False
        if wfo.name in by_passes:
            print "we can bypass checks on", wfo.name
            by_pass_checks = True
        for bypass in by_passes:
            if bypass in wfo.name:
                print "we can bypass", wfo.name, "because of keyword", bypass
                by_pass_checks = True
                break

        if not CI.go(wfi.request["Campaign"]) and not by_pass_checks:
            print "No go for", wfo.name
            continue

        # tuck out DQMIO/DQM
        wfi.request["OutputDatasets"] = [out for out in wfi.request["OutputDatasets"] if not "/DQM" in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request["PrepID"], details=True)
        acdc = []
        acdc_inactive = []
        has_recovery_going = False
        had_any_recovery = False
        for member in familly:
            if member["RequestType"] != "Resubmission":
                continue
            if member["RequestName"] == wfo.name:
                continue
            if member["RequestDate"] < wfi.request["RequestDate"]:
                continue
            if member["RequestStatus"] in [
                "running-open",
                "running-closed",
                "assignment-approved",
                "assigned",
                "acquired",
            ]:
                print wfo.name, "still has an ACDC running", member["RequestName"]
                acdc.append(member["RequestName"])
                # print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False
                has_recovery_going = True
            elif member["RequestStatus"] == None:
                print member["RequestName"], "is not real"
                pass
            else:
                acdc_inactive.append(member["RequestName"])
                had_any_recovery = True
        ## completion check
        percent_completions = {}
        #        print "let's see who is crashing", wfo.name
        #        print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        if not "TotalInputEvents" in wfi.request:
            event_expected, lumi_expected = 0, 0
            if not "recovery" in wfo.status:
                sendEmail(
                    "missing member of the request",
                    "TotalInputEvents is missing from the workload of %s" % wfo.name,
                    destination=["*****@*****.**"],
                )
        else:
            event_expected, lumi_expected = wfi.request["TotalInputEvents"], wfi.request["TotalInputLumis"]

        if "RequestNumEvents" in wfi.request:
            event_expected = int(wfi.request["RequestNumEvents"])
        elif "Task1" in wfi.request and "RequestNumEvents" in wfi.request["Task1"]:
            event_expected = int(wfi.request["Task1"]["RequestNumEvents"])

        fractions_pass = {}
        for output in wfi.request["OutputDatasets"]:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.0
            if lumi_expected:
                percent_completions[output] = lumi_count / float(lumi_expected)
            if event_expected:
                percent_completions[output] = max(percent_completions[output], event_count / float(event_expected))

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and "fractionpass" in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]["fractionpass"]
                print "overriding fraction to", fractions_pass[output], "for", output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to", fractions_pass[output], "by command line for", output

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name, "is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if has_recovery_going:
                sub_assistance += "-recovering"
            elif had_any_recovery:
                ## we want to have this looked at
                sub_assistance += "-manual"
            else:
                sub_assistance += "-recovery"
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request["OutputDatasets"]:
            events_per_lumi[output] = getDatasetEventsPerLumi(output)

        lumi_upper_limit = {}
        for output in wfi.request["OutputDatasets"]:
            upper_limit = 301.0
            campaign = get_campaign(output, wfi)
            # if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and "lumisize" in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]["lumisize"]
                print "overriding the upper lumi size to", upper_limit, "for", campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to", upper_limit, "by command line"

            lumi_upper_limit[output] = upper_limit

        if any([events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name, "has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance += "-biglumi"
            is_closing = False

        any_presence = {}
        for output in wfi.request["OutputDatasets"]:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request["OutputDatasets"]:
            custodial_presences[output] = [s for s in any_presence[output] if "MSS" in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence = {}
        for output in wfi.request["OutputDatasets"]:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output)

        vetoed_custodial_tier = UC.get("tiers_with_no_custodial")
        out_worth_checking = [
            out for out in custodial_locations.keys() if out.split("/")[-1] not in vetoed_custodial_tier
        ]
        size_worth_checking = sum(
            [getDatasetSize(out) / 1023.0 for out in out_worth_checking]
        )  ## size in TBs of all outputs
        if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name, "has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]):
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:", custodial, "because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and "custodial" in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]["custodial"]
                        print "Setting custodial to", custodial, "from campaign configuration"
                        break
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the unified configuration custodial:", custodial, "because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            if not custodial and "InputDataset" in wfi.request:
                ## this is terribly dangerous to assume only
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite(wfi.request["InputDataset"])
                ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset", wfi.request[
                        "InputDataset"
                    ], "does not have custodial in the first place. abort"
                    sendEmail(
                        "dataset has no custodial location",
                        "Please take a look at %s in the logs of checkor" % wfi.request["InputDataset"],
                    )
                    is_closing = False
                    pick_custodial = False

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:", custodial, "because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for", wfo.name
                sendEmail(
                    "cannot find a custodial",
                    "cannot find a custodial for %s probably because of the total output size %d"
                    % (wfo.name, size_worth_checking),
                )

            if custodial and ((not sub_assistance and not acdc) or by_pass_checks):
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output] >= 1:
                            custodials[custodial].append(output)
                        else:
                            print "no file in phedex for", output, " not good to add to custodial requests"

            is_closing = False

        ## disk copy
        disk_copies = {}
        for output in wfi.request["OutputDatasets"]:
            disk_copies[output] = [s for s in any_presence[output] if (not "MSS" in s) and (not "Buffer" in s)]

        if not all(map(lambda sites: len(sites) != 0, disk_copies.values())):
            print wfo.name, "has not all output on disk"
            print json.dumps(disk_copies, indent=2)

        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request["OutputDatasets"]:
            dbs_presence[output] = dbs3Client.getFileCountDataset(output)
            dbs_invalid[output] = dbs3Client.getFileCountDataset(output, onlyInvalid=True)

        fraction_invalid = 0.01
        if (
            not all(
                [
                    dbs_presence[out] == (dbs_invalid[out] + phedex_presence[out])
                    for out in wfi.request["OutputDatasets"]
                ]
            )
            and not options.ignorefiles
        ):
            print wfo.name, "has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if (
            not all(
                [
                    (dbs_invalid[out] <= int(fraction_invalid * dbs_presence[out]))
                    for out in wfi.request["OutputDatasets"]
                ]
            )
            and not options.ignorefiles
        ):
            print wfo.name, "has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance += "-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            print "starting duplicate checker for", wfo.name
            for output in wfi.request["OutputDatasets"]:
                print "\tchecking", output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi(output)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi(output)
                    except:
                        print "was not possible to get the duplicate count for", output
                        is_closing = False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name, "has duplicates"
                print json.dumps(duplications, indent=2)
                ## hook for making file invalidation ?
                sub_assistance += "-duplicates"
                is_closing = False

        ## for visualization later on
        if not wfo.name in fDB.record:
            # print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {"datasets": {}, "name": wfo.name, "closeOutWorkflow": None}
        fDB.record[wfo.name]["closeOutWorkflow"] = is_closing
        for output in wfi.request["OutputDatasets"]:
            if not output in fDB.record[wfo.name]["datasets"]:
                fDB.record[wfo.name]["datasets"][output] = {}
            rec = fDB.record[wfo.name]["datasets"][output]
            rec["percentage"] = float("%.2f" % (percent_completions[output] * 100))
            rec["duplicate"] = duplications[output] if output in duplications else "N/A"
            rec["phedexReqs"] = (
                float("%.2f" % any_presence[output][custodial_presences[output][0]][1])
                if len(custodial_presences[output]) != 0
                else "N/A"
            )
            rec["closeOutDataset"] = is_closing
            rec["transPerc"] = (
                float("%.2f" % any_presence[output][disk_copies[output][0]][1])
                if len(disk_copies[output]) != 0
                else "N/A"
            )
            rec["correctLumis"] = (
                int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            )
            rec["missingSubs"] = (
                False if len(custodial_locations[output]) == 0 else ",".join(list(set(custodial_locations[output])))
            )
            rec["dbsFiles"] = dbs_presence[output]
            rec["dbsInvFiles"] = dbs_invalid[output]
            rec["phedexFiles"] = phedex_presence[output]
            rec["acdc"] = "%d / %d" % (len(acdc), len(acdc + acdc_inactive))

        if by_pass_checks:
            ## force closing
            is_closing = True

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting", wfo.name, "closed-out"
            if not options.test:
                if wfo.wm_status in ["closed-out", "announced", "normal-archived"]:
                    print wfo.name, "is already", wfo.wm_status, "not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer", res
                if not res in ["None", None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)

                if res in [None, "None"]:
                    wfo.status = "close"
                    session.commit()
                else:
                    print "could not close out", wfo.name, "will try again next time"
        else:
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            new_status = "assistance" + sub_assistance
            print wfo.name, "needs assistance with", new_status

            if sub_assistance and wfo.status != new_status and "PrepID" in wfi.request and not "manual" in wfo.status:
                pid = wfi.getPrepIDs()[0].replace("task_", "")
                # pid = wfi.request['PrepID'].replace('task_','')
                ## notify
                messages = {
                    "recovery": "Samples completed with missing statistics:\n%s "
                    % (
                        "\n".join(
                            [
                                "%.2f %% complete for %s" % (percent_completions[output] * 100, output)
                                for output in wfi.request["OutputDatasets"]
                            ]
                        )
                    ),
                    "biglumi": "Samples completed with large luminosity blocks:\n%s "
                    % (
                        "\n".join(
                            [
                                "%d > %d for %s" % (events_per_lumi[output], lumi_upper_limit[output], output)
                                for output in wfi.request["OutputDatasets"]
                                if (events_per_lumi[output] > lumi_upper_limit[output])
                            ]
                        )
                    ),
                    "duplicate": "Samples completed with duplicated luminosity blocks:\n%s"
                    % (
                        "\n".join(
                            [
                                "%s" % output
                                for output in wfi.request["OutputDatasets"]
                                if output in duplications and duplications[output]
                            ]
                        )
                    ),
                }
                text = "The request %s (%s) is facing issue in production.\n" % (pid, wfo.name)
                content = ""
                for case in messages:
                    if case in new_status:
                        content += "\n" + messages[case] + "\n"
                text += content
                text += "You are invited to check, while this is being taken care of by Ops.\n"
                text += "This is an automated message."
                if use_mcm and content:
                    print "Sending notification back to requestor"
                    print text
                    batches = mcm.getA("batches", query="contains=%s&status=announced" % pid)
                    if len(batches):
                        ## go notify the batch
                        bid = batches[-1]["prepid"]
                        print "batch nofication to", bid
                        mcm.put("/restapi/batches/notify", {"notes": text, "prepid": bid})

                    ## go notify the request
                    print "request notification to", pid
                    mcm.put("/restapi/requests/notify", {"message": text, "prepids": [pid]})

            ## case where the workflow was in manual from recoveror
            if not "manual" in wfo.status or new_status != "assistance-recovery":
                wfo.status = new_status
                if not options.test:
                    print "setting", wfo.name, "to", wfo.status
                    session.commit()
            else:
                print "current status is", wfo.status, "not changing to anything"

    fDB.html()

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ",".join(custodials[site]), "=>", site
        if not options.test:
            result = makeReplicaRequest(
                url,
                site,
                list(set(custodials[site])),
                "custodial copy at production close-out",
                custodial="y",
                priority="low",
                approve=(site in SI.sites_auto_approve),
            )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ",".join(transfers[site]), "=>", site
        if not options.test:
            result = None
            # result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Esempio n. 28
0
url = reqmgr_url

may_have_one=set()
may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('away')).all()])
may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()])

wfs = []
wfs.extend( getWorkflows(url, 'running-open', details=True))
wfs.extend( getWorkflows(url, 'running-closed', details=True))
wfs.extend( getWorkflows(url, 'completed', details=True))

may_have_one_too = set()
for wf in wfs:
    if wf['RequestName'] in may_have_one:
        #print wf['RequestName'],"and familly"
        may_have_one_too.update( getWorkflowById(url, wf['PrepID']) )
        
may_have_one.update( may_have_one_too )

for logtype in ['report','joblogs','condorlogs']:
    for d in filter(None,os.popen('ls -d %s/%s/*'%(monitor_dir,logtype)).read().split('\n')):
        if not any([m in d for m in may_have_one]):
            ## that can be removed
            print d,"report file can be removed"
            os.system('rm -rf %s'%d)
        else:
            print d,"is still in use"
    
## protected lfn list
os.system('python listProtectedLFN.py')
Esempio n. 29
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = global_SI
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    holdings = []
    #try:
    #    already_notified = json.loads(open('already_notifified.json').read())
    #except:
    #    print "no record of already notified workflow. starting fresh"
    #    already_notified = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        bypasses.extend( mcm_force )

    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False
        pids = wfi.getPrepIDs()
        bypass_by_mcm = False
        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
            if bypass in pids:
                wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass))
                bypass_checks = True
                bypass_by_mcm = True
                break
        
        #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks:
        #    print "No go for",wfo.name
        #    wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign'])
        #    continue


        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
            elif member['RequestStatus']==None:
                print member['RequestName'],"is not real"
                pass
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = int(wfi.request['Task1']['RequestNumEvents'])

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )
            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        print "These %d files are missing in phedex"%(len(missing_phedex))
                        print "\n".join( missing_phedex )
                    if missing_dbs:
                        print "These %d files are missing in dbs"%(len(missing_dbs))
                        print "\n".join( missing_dbs )

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and bypass_by_mcm:
                        ## shoot large on all prepids
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Esempio n. 30
0
def rejector(url, specific, options=None):

    #use_mcm = True
    #up = componentInfo(mcm=use_mcm, soft=['mcm'])
    up = componentInfo()
    if not up.check(): return
    #use_mcm = up.status['mcm']
    #mcm = McMClient(dev=False) if use_mcm else None

    if specific and specific.startswith('/'):
        ## this is for a dataset
        print setDatasetStatus(specific, 'INVALID')
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend( session.query(Workflow).filter(Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'assistance-clone').all()
        #wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-reject').all())
        ## be careful then on clone case by case
        options.clone = True
        print "not supposed to function yet"
        return 

    print len(wfs),"to reject"

    if len(wfs)>1:
        print "\n".join( [wfo.name for wfo in wfs] )
        answer = raw_input('Reject these')
        if not answer.lower() in ['y','yes']:
            return
        
    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject",spec
            return
        results=[]
        wfi = workflowInfo(url, wfo.name)

        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(url, wfo.name, current_status=wfi.request['RequestStatus'])

        comment=""
        if options.comments: comment = ", reason: "+options.comments

        wfi.sendLog('rejector','invalidating the workflow by unified operator%s'%comment)
        ## need to find the whole familly and reject the whole gang
        familly = getWorkflowById( url, wfi.request['PrepID'] , details=True)
        for fwl in familly:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType']!='Resubmission': continue
            ## does not work on second order acd
            if 'OriginalRequestName' in fwl and fwl['OriginalRequestName'] != wfi.request['RequestName']: continue
            print "rejecting",fwl['RequestName']
            reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'], cascade=False)
            datasets.update( fwl['OutputDatasets'] )

        for dataset in datasets:
            if options.keep:
                print "keeping",dataset,"in its current status"
            else:
                results.append( setDatasetStatus(dataset, 'INVALID') )
                pass


        if all(map(lambda result : result in ['None',None,True],results)):
            print wfo.name,"and",datasets,"are rejected"
            if options and options.clone:
                wfo.status = 'trouble'
                session.commit()                
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(schema['ProcessingVersion'])+1 ## dubious str->int conversion
                else:
                    schema['ProcessingVersion']=2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tampering of the original request
                if options.Memory:
                    if schema['RequestType'] == 'TaskChain':
                        it=1
                        while True:
                            t = 'Task%d'%it
                            it+=1
                            if t in schema:
                                schema[t]['Memory'] = options.Memory
                            else:
                                break
                    else:
                        schema['Memory'] = options.Memory
                        
                if options.Multicore:
                    ## to do : set it properly in taslchains
                    schema['Multicore'] = options.Multicore

                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup']  = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent

                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int,options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] =options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1,ntask-1):
                        schema['Task%d'%it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask-1
                    schema.pop('Task%d'%ntask)

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop shit on the way to reqmgr2
                paramBlacklist = ['BlockCloseMaxEvents', 'BlockCloseMaxFiles', 'BlockCloseMaxSize', 'BlockCloseMaxWaitTime',
                                  'CouchWorkloadDBName', 'CustodialGroup', 'CustodialSubType', 'Dashboard',
                                  'GracePeriod', 'HardTimeout', 'InitialPriority', 'inputMode', 'MaxMergeEvents', 'MaxMergeSize',
                                  'MaxRSS', 'MaxVSize', 'MinMergeSize', 'NonCustodialGroup', 'NonCustodialSubType',
                                  'OutputDatasets', 'ReqMgr2Only', 'RequestDate' 'RequestorDN', 'RequestName', 'RequestStatus',
                                  'RequestTransition', 'RequestWorkflow', 'SiteWhitelist', 'SoftTimeout', 'SoftwareVersions',
                                  'SubscriptionPriority', 'Team', 'timeStamp', 'TrustSitelists', 'TrustPUSitelists',
                                  'TotalEstimatedJobs', 'TotalInputEvents', 'TotalInputLumis', 'TotalInputFiles']
                for p in paramBlacklist:
                    if p in schema:
                        schema.pop( p )
                        #pass
                print "submitting"
                if (options.to_stepchain and (schema['RequestType']=='TaskChain')):
                    ## transform the schema into StepChain schema
                    print "Transforming a TaskChain into a StepChain"
                    schema['RequestType'] = 'StepChain'
                    schema['StepChain'] = schema.pop('TaskChain')
                    step=1
                    while True:
                        if 'Task%d'%step in schema:
                            schema['Step%d'%step] = schema.pop('Task%d'%step)
                            schema['Step%d'%step]['StepName'] = schema['Step%d'%step].pop('TaskName')
                            if 'InputTask' in schema['Step%d'%step]:
                                schema['Step%d'%step]['InputStep'] = schema['Step%d'%step].pop('InputTask')
                            if not 'KeepOutput' in schema['Step%d'%step]:
                                ## this is a weird translation capability. Absence of keepoutput in step means : keep the output. while in TaskChain absence means : drop
                                schema['Step%d'%step]['KeepOutput'] = False
                            step+=1
                        else:
                            break


                print json.dumps( schema, indent=2 )
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning",wfo.name
                    print json.dumps( schema, indent=2 )
                    return 
                print newWorkflow

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfi.sendLog('rejector','Cloned into %s by unified operator %s'%( newWorkflow, comment ))
                wfi.notifyRequestor('Cloned into %s by unified operator %s'%( newWorkflow, comment ),do_batch=False)
            else:
                wfo.status = 'forget'
                wfi.notifyRequestor('Rejected by unified operator %s'%( comment ),do_batch=False)
                session.commit()

        else:
            print "error in rejecting",wfo.name,results
Esempio n. 31
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        if forcings:
            sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if max_per_round and not spec: wfs = wfs[:max_per_round]

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue
            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong ibsolute
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                    "\n".join( missing_phedex )))
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Esempio n. 32
0
def invalidator(url, invalid_status='INVALID'):
    use_mcm = True
    up = componentInfo(mcm=use_mcm)
    if not up.check(): return
    mcm = McMClient(dev=False)

    invalids = mcm.getA('invalidations',query='status=announced')
    print len(invalids),"Object to be invalidated"
    text_to_batch = defaultdict(str)
    text_to_request = defaultdict(str)
    for invalid in invalids:
        acknowledge= False
        pid = invalid['prepid']
        batch_lookup = invalid['prepid']
        text = ""
        if invalid['type'] == 'request':
            wfn = invalid['object']
            print "need to invalidate the workflow",wfn
            wfo = session.query(Workflow).filter(Workflow.name == wfn).first()
            if wfo:
                ## set forget of that thing (although checkor will recover from it)
                print "setting the status of",wfo.status,"to forget"
                wfo.status = 'forget'
                session.commit()
            else:
                ## do not go on like this, do not acknoledge it
                print wfn,"is set to be rejected, but we do not know about it yet"
                #continue
            wfi = workflowInfo(url, wfn)
            success = "not rejected"
            ## to do, we should find a way to reject the workflow and any related acdc
            success = reqMgrClient.invalidateWorkflow(url, wfn, current_status = wfi.request['RequestStatus'])
            ## need to find the whole familly and reject the whole gang
            familly = getWorkflowById( url, wfi.request['PrepID'] , details=True)
            for fwl in familly:
                ## take out all acdc
                if fwl['RequestDate'] < wfi.request['RequestDate']:continue
                if fwl['RequestType']!='Resubmission': continue
                print "rejecting",fwl['RequestName']
                success = reqMgrClient.invalidateWorkflow(url, fwl['RequestName'], current_status=fwl['RequestStatus'])
                print success
            wfi.sendLog('invalidator',"rejection is performed from McM invalidations request")
            acknowledge= True
            text = "The workflow %s (%s) was rejected due to invalidation in McM" % ( wfn, pid )
            batch_lookup = wfn ##so that the batch id is taken as the one containing the workflow name
        elif invalid['type'] == 'dataset':
            dataset = invalid['object']

            if '?' in dataset: continue
            if 'None' in dataset: continue
            if 'None-' in dataset: continue
            if 'FAKE-' in dataset: continue

            print "setting",dataset,"to",invalid_status
            success = setDatasetStatus(dataset , invalid_status )
            if success:
                acknowledge= True
                text = "The dataset %s (%s) was set INVALID due to invalidation in McM" % ( dataset, pid )
            else:
                print "invalidation of",dataset,"did not go so well"
        else:
            print "\t\t",invalid['type']," type not recognized"

        if acknowledge:
            ## acknoldge invalidation in mcm, provided we can have the api
            print "acknowledgment to mcm"
            mcm.get('/restapi/invalidations/acknowledge/%s'%( invalid['_id'] ))
            # prepare the text for batches
            batches = []
            batches.extend(mcm.getA('batches',query='contains=%s'%batch_lookup))
            batches = filter(lambda b : b['status'] in ['announced','done','reset'], batches)
            if len(batches):
                bid = batches[-1]['prepid']
                print "batch nofication to",bid
                text_to_batch[bid] += text+"\n\n"
            # prepare the text for requests
            text_to_request[pid] += text+"\n\n"

    for bid,text in text_to_batch.items():    
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/batches/notify',{ "notes" : text, "prepid" : bid})
        pass
    for pid,text in text_to_request.items():
        if not text: continue
        text += '\n This is an automated message'
        mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid]})
Esempio n. 33
0
def rejector(url, specific, options=None):

    up = componentInfo()

    if specific and specific.startswith('/'):
        return

    if options.filelist:
        wfs = []
        for line in filter(None, open(options.filelist).read().split('\n')):
            print line
            wfs.extend(
                session.query(Workflow).filter(
                    Workflow.name.contains(line)).all())
    elif specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
    else:
        return

    print len(wfs), "to reject"

    if len(wfs) > 1:
        print "\n".join([wfo.name for wfo in wfs])
        answer = raw_input('Reject these')
        if not answer.lower() in ['y', 'yes']:
            return

    for wfo in wfs:
        #wfo = session.query(Workflow).filter(Workflow.name == specific).first()
        if not wfo:
            print "cannot reject", spec
            return
        results = []
        wfi = workflowInfo(url, wfo.name)

        datasets = set(wfi.request['OutputDatasets'])
        reqMgrClient.invalidateWorkflow(
            url, wfo.name, current_status=wfi.request['RequestStatus'])

        wfi.sendLog('rejector',
                    'invalidating the workflow by unified operator')
        ## need to find the whole familly and reject the whole gang
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        for fwl in familly:
            if fwl['RequestDate'] < wfi.request['RequestDate']: continue
            if fwl['RequestType'] != 'Resubmission': continue
            if 'OriginalRequestName' in fwl and fwl[
                    'OriginalRequestName'] != wfi.request['RequestName']:
                continue
            print "rejecting", fwl['RequestName']
            reqMgrClient.invalidateWorkflow(
                url, fwl['RequestName'], current_status=fwl['RequestStatus'])
            datasets.update(fwl['OutputDatasets'])

        for dataset in datasets:
            if options.keep:
                print "keeping", dataset, "in its current status"
            else:
                results.append(setDatasetStatus(dataset, 'INVALID'))

        #if wfi.request['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']:
        #    print 'already',wfi.request['RequestStatus']
        #    if not options.clone:
        #        wfo.status = 'forget'
        #        session.commit()
        #        continue

        if all(map(lambda result: result in ['None', None, True], results)):
            wfo.status = 'forget'
            session.commit()
            print wfo.name, "and", datasets, "are rejected"
            if options and options.clone:
                schema = wfi.getSchema()
                schema['Requestor'] = os.getenv('USER')
                schema['Group'] = 'DATAOPS'
                schema['OriginalRequestName'] = wfo.name
                if 'ProcessingVersion' in schema:
                    schema['ProcessingVersion'] = int(
                        schema['ProcessingVersion']
                    ) + 1  ## dubious str->int conversion
                else:
                    schema['ProcessingVersion'] = 2
                for k in schema.keys():
                    if k.startswith('Team'):
                        schema.pop(k)
                    if k.startswith('checkbox'):
                        schema.pop(k)

                ## a few tampering of the original request
                if options.Memory:
                    schema['Memory'] = options.Memory
                if options.deterministic:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['DeterministicPileup'] = True
                if options.EventsPerJob:
                    if schema['RequestType'] == 'TaskChain':
                        schema['Task1']['EventsPerJob'] = options.EventsPerJob
                    else:
                        schema['EventsPerJob'] = options.EventsPerJob
                if options.EventAwareLumiBased:
                    schema['SplittingAlgo'] = 'EventAwareLumiBased'
                if options.TimePerEvent:
                    schema['TimePerEvent'] = options.TimePerEvent

                if options.ProcessingString:
                    schema['ProcessingString'] = options.ProcessingString
                if options.AcquisitionEra:
                    schema['AcquisitionEra'] = options.AcquisitionEra
                if options.runs:
                    schema['RunWhitelist'] = map(int, options.runs.split(','))
                if options.PrepID:
                    schema['PrepID'] = options.PrepID

                if schema['RequestType'] == 'TaskChain' and options.no_output:
                    ntask = schema['TaskChain']
                    for it in range(1, ntask - 1):
                        schema['Task%d' % it]['KeepOutput'] = False
                    schema['TaskChain'] = ntask - 1
                    schema.pop('Task%d' % ntask)

                ## update to the current priority
                schema['RequestPriority'] = wfi.request['RequestPriority']

                ## drop shit on the way to reqmgr2
                for p in [
                        'RequestStatus',
                        'RequestTransition',
                        'RequestorDN',
                        'RequestWorkflow',
                        'OutputDatasets',
                        'ReqMgr2Only',
                        #'Group',
                        'RequestDate',
                        #'ConfigCacheUrl',
                        'RequestName',
                        'timeStamp',
                        'SoftwareVersions',
                        'CouchURL'
                ]:
                    if p in schema:
                        schema.pop(p)
                        #pass
                print "submitting"
                print json.dumps(schema, indent=2)
                newWorkflow = reqMgrClient.submitWorkflow(url, schema)
                if not newWorkflow:
                    print "error in cloning", wfo.name
                    print json.dumps(schema, indent=2)
                    return
                print newWorkflow
                #m = re.search("details\/(.*)\'",response)
                #if not m:
                #    print "error in cloning",wfo.name
                #    print response
                #    print json.dumps( schema, indent=2 )
                #    return
                #newWorkflow = m.group(1)

                data = reqMgrClient.setWorkflowApproved(url, newWorkflow)
                print data
                wfo.status = 'trouble'
                session.commit()
                wfi.sendLog(
                    'rejector',
                    'Cloned into %s by unified operator' % (newWorkflow))
        else:
            print "error in rejecting", wfo.name, results
Esempio n. 34
0
def injector(url, options, specific):
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    UC = unifiedConfiguration()

    transform_keywords = UC.get('convert_to_stepchain')

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for user in UC.get("user_rereco"):
        workflows.extend(
            getWorkflows(url,
                         status=options.wmstatus,
                         user=user,
                         rtype="ReReco"))
    for user in (options.user_relval.split(',')
                 if options.user_relval else UC.get("user_relval")):
        workflows.extend(
            getWorkflows(url,
                         status=options.wmstatus,
                         user=user,
                         rtype="TaskChain"))
    for user in (options.user_storeresults.split(',') if
                 options.user_storeresults else UC.get("user_storeresults")):
        workflows.extend(
            getWorkflows(url,
                         status=options.wmstatus,
                         user=user,
                         rtype="StoreResults"))

    print len(workflows), "in line"
    cannot_inject = set()
    to_convert = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue

        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            familly = session.query(Workflow).filter(
                Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))

                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url,
                                        req_member['RequestName'],
                                        request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(
                            session.query(Workflow).filter(
                                Workflow.name ==
                                req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in [
                            'forget', 'trouble', 'forget-unlock',
                            'forget-out-unlock'
                    ]:
                        wfi.sendLog(
                            'injector', "Should not put %s because of %s %s" %
                            (wf, lwfo.name, lwfo.status))
                        sendLog('injector',
                                "Should not put %s because of %s %s" %
                                (wf, lwfo.name, lwfo.status),
                                level='critical')
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False
            ## add a check on validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog(
                        'injector', "One of the input is not VALID. %s : %s" %
                        (d, status_cache[d]))
                    sendLog('injector',
                            "One of the input of %s is not VALID. %s : %s" %
                            (wf, d, status_cache[d]),
                            level='critical')
                    can_add = False
                ## check for any file in phedex, to verify existence
                _, ph_files, _, _ = getDatasetFiles(url, d)
                if not ph_files and not ('StoreResults'
                                         == wfi.request.setdefault(
                                             'RequestType', None)):
                    wfi.sendLog(
                        'injector',
                        "One of the input has no file in phedex: %s" % d)
                    sendLog('injector',
                            "One of the input has no file in phedex: %s" % d,
                            level='critical')
                    can_add = False

            ### ban some workflow that you don't like anymore
            #outputs = wfi.request['OutputDatasets']

            if not can_add: continue

            ## temporary hack to transform specific taskchain into stepchains
            #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords)
            good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=None)

            ## match keywords and technical constraints
            #if (not options.no_convert) and good_for_stepchain and not wfi.isRelval():
            #    to_convert.add( wf )
            #    wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf)
            #    #sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf)

            wfi.sendLog('injector', "considering %s" % wf)

            new_wf = Workflow(name=wf,
                              status=options.setstatus,
                              wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog(
            'injector',
            'These workflow cannot be added in because of duplicates \n\n %s' %
            ('\n'.join(cannot_inject)),
            level='warning')

    for wf in to_convert:
        os.system(
            './Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s'
            % wf)

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    #print "getting all transfers"
    #all_transfers=session.query(Transfer).all()
    #print "go!"

    ## pick up replacements
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in [
                        'rejected', 'rejected-archived', 'aborted',
                        'aborted-archived'
                ]:
                    continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            if wfi.isRelval():
                #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.')
                wfi.sendLog(
                    'injector',
                    'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget'
                )
                wf.status = 'forget'
                session.commit()
            else:
                wfi.sendLog(
                    'injector',
                    'the workflow was found in trouble with no replacement')
                no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog(
                'injector',
                'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"

        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector',
                    'Multiple wf in line, will take the last one for %s \n%s' %
                    (wf.name, ', '.join(fwl['RequestName']
                                        for fwl in true_familly)),
                    level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(
                Workflow.name == member).first()
            if not new_wf:
                sendLog('injector',
                        "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member,
                                  status=status,
                                  wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog(
                    'injector',
                    "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            for tr in session.query(TransferImp).filter(
                    TransferImp.workflow_id == wf.id).all():
                ## get all transfer working for the old workflow
                existing = session.query(TransferImp).filter(
                    TransferImp.phedexid == tr.phedexid).filter(
                        TransferImp.workflow_id == new_wf.id).all()
                tr.active = False  ## disable the old one
                if not existing:
                    ## create the transfer object for the new dependency
                    tri = TransferImp(phedexid=tr.phedexid, workflow=new_wf)
                    session.add(tri)
                session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector',
                'workflow with no replacement\n%s \n are dangling there' %
                ('\n'.join(no_replacement)),
                level='critical')
Esempio n. 35
0
import random
from JIRAClient import JIRAClient

up = componentInfo(soft=['mcm','wtc','jira'])
if not up.check(): sys.exit(0)

JC = JIRAClient() if up.status.get('jira',False) else None
if JC:
    those = JC.find({'status' : '!CLOSED'})
    for t in those:
        s= t.fields.summary
        s = s.replace('issues','')
        s = s.strip()
        if s.count(' ')!=0: continue
        print s
        wfs = getWorkflowById(reqmgr_url, s, details=True)
        statuses = set([r['RequestStatus'] for r in wfs])
        check_against = ['assignment-approved', 'running-open','running-closed','completed','acquired', 'assigned', 'closed-out']
        if statuses:
            if all([s not in check_against for s in statuses]):
                print t.key,"can be closed"
                print statuses
                JC.close(t.key) ## uncomment to close JIRAs
                continue
        print t.key,statuses

UC = unifiedConfiguration()

## get all acquired and push one to stepchain so that we can acquire it on nersc
#N_for_cloud = isHEPCloudReady(reqmgr_url)
#if N_for_cloud: