Example #1
0
    def run(self):
        url = self.url
        dataset = self.dataset
        reasons = []
        statuses = [
            'assignment-approved', 'assigned', 'acquired', 'running-open',
            'running-closed', 'completed', 'force-complete', 'closed-out'
        ]

        ##print "reqmgr check on ",dataset
        actors = getWorkflowByInput(url, dataset, details=True)
        using_actors = [
            actor for actor in actors if actor['RequestStatus'] in statuses
        ]
        if len(using_actors):
            reasons.append('input')

        actors = getWorkflowByOutput(url, dataset, details=True)
        using_actors = [
            actor for actor in actors if actor['RequestStatus'] in statuses
        ]
        if len(using_actors):
            reasons.append('output')

        actors = getWorkflowByMCPileup(url, dataset, details=True)
        using_actors = [
            actor for actor in actors if actor['RequestStatus'] in statuses
        ]
        if len(using_actors):
            reasons.append('pilup')

        self.reasons = reasons
Example #2
0
    def run(self):
        url = self.url
        dataset= self.dataset
        reasons = []
        statuses = ['assignment-approved','assigned','acquired','running-open','running-closed','completed','force-complete','closed-out']

        ##print "reqmgr check on ",dataset
        actors = getWorkflowByInput( url, dataset , details=True)
        using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
        if len(using_actors):
            reasons.append('input')

        actors = getWorkflowByOutput( url, dataset , details=True)
        using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
        if len(using_actors):
            reasons.append('output')

        actors = getWorkflowByMCPileup( url, dataset , details=True)
        using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
        if len(using_actors):
            reasons.append('pilup')
        
        self.reasons = reasons
Example #3
0
def outcleanor(url, options):

    if options.approve:
        for user in ['*Vlimant']:#,'*Cremonesi']:
            deletes = listDelete( url , user = user)
            for (site,who,tid) in deletes:
                if 'MSS' in site: continue### ever
                print site,who,tid
                print "approving deletion"
                print approveSubscription(url, tid, nodes = [site], comments = 'Production cleaning by data ops')
        return

    

    sites_and_datasets = defaultdict(list)
    our_copies = defaultdict(list)
    wf_cleaned = {}
    
    wfs = []
    for fetch in options.fetch.split(','):
        wfs.extend(session.query(Workflow).filter(Workflow.status==fetch).all())

    random.shuffle( wfs )
    last_answer = None
    for wfo in wfs :
        if options.number and len(wf_cleaned)>= options.number:
            print "Reached",options.number,"cleaned"
            break
        print '-'*100
        wfi = workflowInfo(url, wfo.name)
        goes = {} # boolean per output
        for dataset in wfi.request['OutputDatasets']:
            goes[dataset] = False
            keep_one_out = True
            status = getDatasetStatus( dataset )
            print "\n\tLooking at",dataset,status,"\n"
            vetoes = None
            if status == 'INVALID':
                vetoes = ['Export','Buffer'] ## can take themselves out
                keep_one_out = False # just wipe clean

            elif status == None:
                print dataset,"actually does not exist. skip"
                goes[dataset] = True
                continue

            elif status in ['PRODUCTION','VALID'] and wfo.status in ['forget','trouble']:
                print dataset,"should probably be invalidated. (",wfo.status,") skip"
                keep_one_out = False # just wipe clean
                continue ## you are not sure. just skip it for the time being

            elif status == 'PRODUCTION' and wfo.status in ['clean']:
                print dataset,"should probably be set valid .skip"
                continue ## you are not sure. just skip it for the time being

            if status == 'VALID' and dataset.startswith('/MinBias'):
                print "This is a /MinBias. skip"
                continue

            if '/DQM' in dataset:
                keep_one_out = False

            total_size = getDatasetSize( dataset )
            
            our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps", vetoes=vetoes)
            also_our_presence = getDatasetPresence(url, dataset, complete=None, group="", vetoes=vetoes)
            
            ## merge in one unique dict
            for site in also_our_presence:
                if site in our_presence:
                    there,frac = our_presence[site]
                    other,ofrac = also_our_presence[site]
                    our_presence[site] = (max(there,other),max(frac,ofrac))
                else:
                    our_presence[site] = also_our_presence[site]
                
            if our_presence: print our_presence

            ## analysis ops copies need to be taken into account
            anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps")
            own_by_anaops = anaops_presence.keys()
            
            ## all our copies
            to_be_cleaned = our_presence.keys()
            if not len(to_be_cleaned):
                print "nowhere to be found of ours,",len(own_by_anaops),"in analysi ops pool"
                goes[dataset] = True
                continue

            print "Where we own bits of dataset"
            print to_be_cleaned
     

            if len(own_by_anaops):
                ## remove site with the anaops copies
                to_be_cleaned = list(set(to_be_cleaned) - set(own_by_anaops))
                keep_one_out = False ## in that case, just remove our copies
                print "Own by anaops (therefore not keep a copy of ours)"
                print own_by_anaops
            else:
                ## we should not be looking at anything that was not passed to DDM, otherwise we'll be cutting the grass under our feet
                using_the_same = getWorkflowByInput(url, dataset, details=True)
                conflict = False
                for other in using_the_same:
                    if other['RequestName'] == wfo.name: continue
                    if other['RequestType'] == 'Resubmission': continue
                    if not other['RequestStatus'] in ['announced','normal-archived','aborted','rejected','aborted-archived','rejected-archived','closed-out','None',None]:
                        print other['RequestName'],'is in status',other['RequestStatus'],'preventing from cleaning',dataset
                        conflict=True
                        break
                if conflict:
                    continue

                ## not being used. a bit less dangerous to clean-out
                ## keep one full copy out there
                full_copies = [site for (site,(there,fract)) in our_presence.items() if there]
                if keep_one_out:
                    if not len(full_copies):
                        print "we do not own a full copy of",dataset,status,wfo.status,".skip"
                        continue
                    stay_there = random.choice( full_copies ) #at a place own by ops
                    print "Where we keep a full copy", stay_there
                    to_be_cleaned.remove( stay_there )
                    our_copies[stay_there].append( dataset )
                else:
                    print "We do not want to keep a copy of ",dataset,status,wfo.status

            if len(to_be_cleaned):
                print "Where we can clean"
                print to_be_cleaned
                for site in to_be_cleaned:
                    sites_and_datasets[site].append( (dataset, total_size*our_presence[site][1]/100., status) )
                goes[dataset] = True
            else:
                print "no cleaning to be done"
                goes[dataset] = True

        print wfo.name,"scrutinized"
        if all(goes.values()):
            print "\t",wfo.name,"can toggle -out"
        def ask():
            global last_answer
            last_answer = raw_input('go on ?')
            return last_answer
        if options.auto or ask() in ['y','']:
            if all(goes.values()):
                wfo.status = wfo.status+'-out'
                wf_cleaned[wfo.name] = wfo.status
            continue
        elif last_answer in ['q','n']:
            break
        else:
            return 

    if options.auto:
        pass
    elif last_answer in ['q']:
        return

    print "Potential cleanups"
    for (site,items) in sites_and_datasets.items():
        cleanup = sum([size for (_,size,_) in items])
        print "\n\t potential cleanup of","%8.4f"%cleanup,"GB at ",site
        print "\n".join([ds+" "+st for ds,_,st in items])
        datasets = [ ds for ds,_,st in items]

    print "Copies and bits we are going to delete"
    print json.dumps( sites_and_datasets, indent=2)

    print "Copies we are keeping"
    print json.dumps( our_copies, indent=2 )     

    print "Workflows cleaned for output"
    print json.dumps( wf_cleaned, indent=2 )
    stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    open('outcleaning_%s.json'%stamp,'w').write( json.dumps( sites_and_datasets, indent=2))
    open('keepcopies_%s.json'%stamp,'w').write( json.dumps( our_copies, indent=2))
    open('wfcleanout_%s.json'%stamp,'w').write( json.dumps( wf_cleaned, indent=2))


    if (not options.test) and (options.auto or raw_input("Satisfied ? (y will trigger status change and deletion requests)") in ['y']):
        for (site,items) in sites_and_datasets.items():
            datasets = [ ds for ds,_,st in items]
            print "making deletion to",site
            result = makeDeleteRequest(url, site, datasets, "Cleanup output after production. DataOps will take care of approving it.")
            print result
            ## approve it right away ?
            if 'MSS' in site: continue
            if 'Export' in site: continue
            if 'Buffer' in site: continue
            for did in [item['id'] for item in result['phedex']['request_created']]:
                print "auto-approve disabled, but ready"
                #approveSubscription(url, did, nodes = [site], comments = 'Auto-approving production cleaning deletion')
                pass
        session.commit()
    else:
        print "Not making the deletion and changing statuses"
Example #4
0
def stagor(url, specific=None, options=None):

    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    TS = transferStatuses()
    cached_transfer_statuses = TS.content()
    transfer_statuses = {}

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0

    lost_blocks = json.loads(
        eosRead('%s/lost_blocks_datasets.json' % monitor_dir))
    lost_files = json.loads(
        eosRead('%s/lost_files_datasets.json' % monitor_dir))
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys() + lost_files.keys()):
        b, f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset, "has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f:
            print dataset, "has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    def time_point(label="", sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s" % (label, nows)
        print "Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())

    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    needs_by_priority = defaultdict(list)
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in [
                'running-open', 'running-closed', 'completed', 'assigned',
                'acquired'
        ]:
            wfi.sendLog('stagor',
                        "is in status %s" % wfi.request['RequestStatus'])
            wfo.status = 'away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            ## that usually happens for relvals
            if wfi.request['RequestStatus'] in [
                    'rejected', 'aborted', 'aborted-completed',
                    'aborted-archived', 'rejected-archived'
            ] and wfi.isRelval():
                wfo.status = 'forget'
                session.commit()
                continue
            else:
                print wfo.name, "is", wfi.request['RequestStatus']
                #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
                sendLog("stagor",
                        "%s is in %s, set away" %
                        (wfo.name, wfi.request['RequestStatus']),
                        level='critical')
                wfo.status = 'away'
                session.commit()
                continue

        wfois.append((wfo, wfi))
        _, primaries, _, secondaries = wfi.getIO()
        for dataset in list(primaries) + list(secondaries):
            needs[wfo.name].append(dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append(dataset)
            wfi.sendLog('stagor', '%s needs %s' % (wfo.name, dataset))

    time_point("Check staging workflows")

    open('%s/dataset_requirements.json' % monitor_dir,
         'w').write(json.dumps(needs, indent=2))
    for prio in needs_by_priority:
        needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    open('%s/dataset_priorities.json' % monitor_dir,
         'w').write(json.dumps(needs_by_priority, indent=2))

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()

    ## first check if anything is inactive
    all_actives = set([
        transfer.phedexid for transfer in session.query(TransferImp).filter(
            TransferImp.active).all()
    ])
    for active_phedexid in all_actives:
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip = False
                sendLog(
                    'stagor', "\t%s is staging for %s" %
                    (imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor', "setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(
        set([
            transfer.phedexid for transfer in session.query(
                TransferImp).filter(TransferImp.active).all()
        ]))
    for phedexid in all_actives:

        if specific: continue

        ## check on transfer completion
        not_cached = False
        if phedexid in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor', "read %s from cache" % phedexid)
            checks = cached_transfer_statuses[phedexid]
        else:
            ## I actually would like to avoid that all I can
            sendLog('stagor',
                    'Performing spurious transfer check on %s' % phedexid,
                    level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            try:
                print json.dumps(checks, indent=2)
            except:
                print checks

            if not checks:
                ## this is going to bias quite heavily the rest of the code. we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog(
                    'stagor',
                    'Stagor has got a skewed input from checkTransferStatus',
                    level='critical')
                checks = {}
                pass
            else:
                TS.add(phedexid, checks)

        time_point("Check transfer status %s" % phedexid, sub_lap=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname] = {}
                if not dsname in completion_by_input:
                    completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid] = all(
                    map(lambda i: i >= good_enough, checks[dsname].values()))
                completion_by_input[dsname][phedexid] = checks[dsname].values()
        if checks:
            sendLog(
                'stagor', "Checks for %s are %s" %
                (phedexid, [node.values() for node in checks.values()]))
            done = all(
                map(
                    lambda i: i >= good_enough,
                    list(
                        itertools.chain.from_iterable(
                            [node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ", phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:  # and tr_wf.status == 'staging':
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id] = {}
                done_by_wf_id[tr_wf.id][phedexid] = done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s, v in checks[ds].items():
                dataset_endpoints[ds].add(s)

        if done:
            sendLog('stagor', "%s is done" % phedexid)
            TS.add(phedexid, checks)
        else:
            sendLog(
                'stagor',
                "%s is not finished %s" % (phedexid, pprint.pformat(checks)))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [
                    SI.SE_to_CE(s) for s, v in checks[ds].items()
                    if v < good_enough
                ]
                sites_incomplete_down = [
                    s for s in sites_incomplete if not s in SI.sites_ready
                ]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]

                if sites_incomplete_down:
                    sendLog(
                        'stagor',
                        "%s are in downtime, while waiting for %s to get there"
                        % (",".join(sites_incomplete_down), ds))
                    endpoint_in_downtime[ds].update(sites_incomplete_down)
                if sites_incomplete:
                    endpoint_incompleted[ds].update(sites_incomplete)

    time_point("Check on-going transfers")

    print "End points"
    for k in dataset_endpoints:
        dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps(dataset_endpoints, indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime:
        endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps(endpoint_in_downtime, indent=2)

    print "End point incomplete in down time"
    for k in endpoint_incompleted:
        endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps(endpoint_incompleted, indent=2)

    #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    eosFile('%s/transfer_statuses.json' % monitor_dir,
            'w').write(json.dumps(TS.content(), indent=2)).close()
    eosFile('%s/dataset_endpoints.json' % monitor_dir,
            'w').write(json.dumps(dataset_endpoints, indent=2)).close()

    already_stuck = json.loads(
        eosRead('%s/stuck_transfers.json' % monitor_pub_dir)).keys()
    already_stuck.extend(getAllStuckDataset())

    missing_in_action = defaultdict(list)

    print "-" * 10, "Checking on workflows in staging", "-" * 10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda: defaultdict(float))
    presence_cache = defaultdict(dict)

    time_point("Preparing for more")
    for wfo, wfi in wfois:
        print "#" * 30
        time_point("Forward checking %s" % wfo.name, sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_, primaries, _, secondaries,
         sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys = {}
        for need in list(primaries) + list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor', "missing transfer report for %s" % need)
                readys[need] = False
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report',
                          '%s does not have a transfer report' % (need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog(
                    'stagor',
                    "assuming it is OK for secondary %s to have no attached transfers"
                    % need)
                readys[need] = True
                done_by_input[need] = {"fake": True}
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor', "%s is ready" % need)
                print json.dumps(done_by_input[need], indent=2)
                readys[need] = True
            else:
                wfi.sendLog(
                    'stagor', "%s is not ready \n%s" %
                    (need, json.dumps(done_by_input[need], indent=2)))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',
                            "all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor', "all needs are fullfilled, already")
                print json.dumps(readys, indent=2)
        else:
            wfi.sendLog('stagor', "missing requirements")
            copies_needed, _ = wfi.getNCopies()
            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):  #+list(secondaries):
                if endpoint_in_downtime[need] and endpoint_in_downtime[
                        need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog(
                        'stagor',
                        "%s has only incomplete endpoint in downtime\n%s" %
                        (need, endpoint_in_downtime[need]))
                    re_transfer = True

                if not se_allowed_key in available_cache[need]:
                    available_cache[need][
                        se_allowed_key] = getDatasetBlocksFraction(
                            url, need, sites=se_allowed)
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog(
                            'stagor',
                            "assuming it is OK to move on like this already for %s"
                            % need)
                        jump_ahead = True
                    else:
                        wfi.sendLog(
                            'stagor', "Available %s times" %
                            available_cache[need][se_allowed_key])
                        missing_and_downtime = list(
                            set(endpoint_in_downtime[need])
                            & set(endpoint_incompleted[need]))
                        if missing_and_downtime:
                            wfi.sendLog(
                                'stagor',
                                "%s is incomplete at %s which is in downtime, trying to move along"
                                % (need, ','.join(missing_and_downtime)))
                            jump_ahead = True
                        else:
                            wfi.sendLog(
                                'stagor',
                                "continue waiting for transfers for optimum production performance."
                            )

            ## compute a time since staging to filter jump starting ?
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries) + list(secondaries):
                if need in already_stuck:
                    wfi.sendLog('stagor',
                                "%s is stuck, so try to jump ahead" % need)
                    jump_ahead = True

            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead" % wfo.name
                details_text += '\n%s wants %s copies' % (wfo.name,
                                                          copies_needed)
                copies_needed = max(1, copies_needed - 1)
                details_text += '\nlowering by one unit to %s' % copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True

                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][
                            se_allowed_key] = getDatasetPresence(
                                url, need, within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update(presence.keys())
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog(
                        'stagor', "%s is available %s times (%s), at %s" %
                        (need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secon
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog(
                        'stagor', "%s is this much transfered %s" %
                        (need, json.dumps(done_by_input[need], indent=2)))
                    all_check &= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:
                    wfi.sendLog(
                        'stagor',
                        "needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name, "has to wait a bit more"
                    wfi.sendLog('stagor', "needs to wait a bit more")
            else:
                wfi.sendLog('stagor', "not checking availability")

            if re_transfer:
                wfi.sendLog(
                    'stagor',
                    "Sending back to considered because of endpoint in downtime"
                )
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add(wfo.name)

    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor',
                "sending back to considered the following workflows \n%s" %
                ('\n'.join(send_back_to_considered)),
                level='critical')

    print "-" * 10, "Checking on non-available datasets", "-" * 10
    ## now check on those that are not fully available

    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min(available_cache[dsname].values())

    really_stuck_dataset = set()

    for dsname, available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(
                Workflow.name == using_it).first()
            if wf:
                using_wfos.append(wf)

        if not len(done_by_input[dsname]):
            print "For dataset", dsname, "there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending", wf.name, "back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor',
                                "%s was send back and might be trouble" %
                                wf.name,
                                level='critical')
                    else:
                        print "would send", wf.name, "back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog(
                            'stagor',
                            "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."
                            % wf.name,
                            level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete", dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks, lost_files = findLostBlocksFiles(
                url, dsname) if (not dsname.endswith('/RAW')) else ([], [])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss, _, n_missing = getDatasetBlockFraction(
                    dsname, lost_block_names)
                print "We have lost", len(
                    lost_block_names
                ), "blocks", lost_block_names, "for %f%%" % (100. *
                                                             fraction_loss)
                if fraction_loss > 0.05:  ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d blocks, for %d events, %3.2f %% loss'
                        % (dsname, len(lost_block_names), n_missing,
                           100 * fraction_loss),
                        level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog(
                                'stagor',
                                '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'
                                % (wf.name, dsname, len(lost_block_names),
                                   n_missing, 100 * fraction_loss),
                                level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog(
                            'stagor',
                            '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_block_names), n_missing,
                               fraction_loss, '\n'.join(lost_block_names)),
                            level='critical')
                        known_lost_blocks[dsname] = [
                            i['name'] for i in lost_blocks
                        ]
                really_stuck_dataset.add(dsname)

            if lost_files:
                fraction_loss, _, n_missing = getDatasetFileFraction(
                    dsname, lost_file_names)
                print "We have lost", len(
                    lost_file_names
                ), "files", lost_file_names, "for %f%%" % fraction_loss

                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d files, for %d events, %f %% loss' %
                        (dsname, len(lost_file_names), n_missing,
                         fraction_loss),
                        level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog(
                            'stagor',
                            '%s is missing %d files, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_file_names), n_missing,
                               fraction_loss, '\n'.join(lost_file_names)),
                            level='critical')
                        known_lost_files[dsname] = [
                            i['name'] for i in lost_files
                        ]

                ## should the status be change to held-staging and pending on a ticket

            missings = [
                pid for (pid, d) in done_by_input[dsname].items() if d == False
            ]
            print "\t", done_by_input[dsname]
            print "\tneeds", len(done_by_input[dsname])
            print "\tgot", done_by_input[dsname].values().count(True)
            print "\tmissing", missings
            missing_in_action[dsname].extend(missings)

    rr = eosFile('%s/lost_blocks_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_blocks, indent=2))
    rr.close()

    rr = eosFile('%s/lost_files_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_files, indent=2))
    rr.close()

    eosFile('%s/incomplete_transfers.json' % monitor_dir,
            'w').write(json.dumps(missing_in_action, indent=2)).close()
    print "Stuck transfers and datasets"
    print json.dumps(missing_in_action, indent=2)

    TD = transferDataset()
    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add(dataset)

    for k in datasets_by_phid:
        #datasets_by_phid[k] = list(datasets_by_phid[k])
        TD.add(k, list(datasets_by_phid[k]))

    #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close()

    eosFile('%s/really_stuck_dataset.json' % base_eos_dir,
            'w').write(json.dumps(list(really_stuck_dataset),
                                  indent=2)).close()
    print '\n' * 2, "Datasets really stuck"
    print '\n'.join(really_stuck_dataset)

    #############
    ## not going further for what matters
    #############
    return
Example #5
0
def stagor(url,specific =None, options=None):
    
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost = json.loads(open('lost_blocks_datasets.json').read())
    still_lost = []
    for dataset in lost:
        l = findLostBlocks(url ,dataset)
        if not l:
            print dataset,"is not really lost"
        else:
            still_lost.append( dataset )
    open('lost_blocks_datasets.json','w').write( json.dumps( still_lost, indent=2) )

    if options.fast:
        print "doing the fast check of staged with threshold:",options.goodavailability
        for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
            if specific and not specific in wfo.name: continue
            wfi = workflowInfo(url, wfo.name)
            sites_allowed = getSiteWhiteList( wfi.getIO() )
            if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = CI.parameters(wfi.request['Campaign'])['SiteWhitelist']
            if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist']))
            _,primaries,_,secondaries = wfi.getIO()
            se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] 
            all_check = True
            for dataset in list(primaries):#+list(secondaries) ?
                #print se_allowed
                available = getDatasetBlocksFraction( url , dataset , sites=se_allowed )
                all_check &= (available >= options.goodavailability)
                if not all_check: break

            if all_check:
                print "\t\t",wfo.name,"can go staged"
                wfo.status = 'staged'
                session.commit()
            else:
                print "\t",wfo.name,"can wait a bit more"
        return 

    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            print wfo.name,"needs",dataset

    for transfer in session.query(Transfer).all():
        if specific  and str(transfer.phedexid)!=str(specific): continue

        skip=True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf: 
                if tr_wf.status == 'staging':
                    print "\t",transfer.phedexid,"is staging for",tr_wf.name
                    skip=False

        if skip: continue
        if transfer.phedexid<0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url,  transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            print transfer.phedexid,"is not yet approved"
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][transfer.phedexid]=checks[dsname].values()
        if checks:
            print "Checks for",transfer.phedexid,[node.values() for node in checks.values()]
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the scubscriptions API of ",transfer.phedexid
            print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][transfer.phedexid]=done


        if done:
            ## transfer.status = 'done'
            print transfer.phedexid,"is done"
        else:
            print transfer.phedexid,"not finished"
            pprint.pprint( checks )

    #print done_by_input
    print "\n----\n"
    for dsname in done_by_input:
        fractions = None
        if dsname in completion_by_input:
            fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()])
        
        ## the workflows in the waiting room for the dataset
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )
        
        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                    else:
                        print "would send",wf.name,"back to considered"
                        sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
            continue

        #need_sites = int(len(done_by_input[dsname].values())*0.7)+1
        need_sites = len(done_by_input[dsname].values())
        #if need_sites > 10:            need_sites = int(need_sites/2.)
        got = done_by_input[dsname].values().count(True)
        if all([wf.status != 'staging' for wf in using_wfos]):
            ## not a single ds-using wf is in staging => moved on already
            ## just forget about it
            print "presence of",dsname,"does not matter anymore"
            print "\t",done_by_input[dsname]
            print "\t",[wf.status for wf in using_wfos]
            print "\tneeds",need_sites
            continue #??
            
        ## should the need_sites reduces with time ?
        # with dataset choping, reducing that number might work as a block black-list.

        if len(done_by_input[dsname].values()) and all(done_by_input[dsname].values()):
            print dsname,"is everywhere we wanted"
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif fractions and len(list(fractions))>1 and set(fractions)==1:
            print dsname,"is everywhere at the same fraction"
            print "We do not want this in the end. we want the data we asked for"
            continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us everywhere the same. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif got >= need_sites:
            print dsname,"is almost everywhere we wanted"
            #print "We do not want this in the end. we want the data we asked for"
            #continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is almost with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        else:
            print "incomplete",dsname
            lost = findLostBlocks(url, dsname)
            try:
                known_lost = json.loads(open('lost_blocks_datasets.json').read())
            except:
                print "enable to get the known_lost from local json file"
                known_lost = []

            if lost and not dsname in known_lost:
                lost_names = [item['name'] for item in lost]
                ## make a deeper investigation of the block location to see whether it's really no-where no-where

                print "We have lost",len(lost),"blocks",lost_names
                #print json.dumps( lost , indent=2 )
                sendEmail('we have lost a few blocks', str(len(lost))+" in total.\nDetails \n:"+json.dumps( lost , indent=2 ))
                known_lost.append(dsname)
                rr= open('lost_blocks_datasets.json','w')
                rr.write( json.dumps( known_lost, indent=2))
                rr.close()
                ## should the status be change to held-staging and pending on a ticket



            print "\t",done_by_input[dsname]
            print "\tneeds",need_sites
            print "\tgot",got

    for wfid in done_by_wf_id:
        #print done_by_wf_id[wfid].values()
        ## ask that all related transfer get into a valid state
        if all(done_by_wf_id[wfid].values()):
            pass
Example #6
0
                #print dataset,"is waiting for custodial"
                sum_waiting += size
                remainings[site][dataset]["reasons"].append('tape')

            if dataset in stuck:
                sum_stuck += size
                remainings[site][dataset]["reasons"].append('stuck-tape')
            if dataset in missing:
                sum_missing += size
                remainings[site][dataset]["reasons"].append('missing-tape')

            statuses = [
                'assignment-approved', 'acquired', 'running-open',
                'running-closed', 'completed', 'closed-out'
            ]
            actors = getWorkflowByInput(url, dataset, details=True)
            #actors = [wfc['doc'] for wfc in by_input if wfc['key']==dataset]
            using_actors = [
                actor for actor in actors if actor['RequestStatus'] in statuses
            ]
            if len(using_actors):
                remainings[site][dataset]["reasons"].append('input')
            actors = getWorkflowByOutput(url, dataset, details=True)
            #actors = [wfc['doc'] for wfc in by_output if wfc['key']==dataset]
            using_actors = [
                actor for actor in actors if actor['RequestStatus'] in statuses
            ]
            if len(using_actors):
                remainings[site][dataset]["reasons"].append('output')
            actors = getWorkflowByMCPileup(url, dataset, details=True)
            #actors = [wfc['doc'] for wfc in by_pileup if wfc['key']==dataset]
Example #7
0
def outcleanor(url, options):

    if options.approve:
        for user in ['*Vlimant']:  #,'*Cremonesi']:
            deletes = listDelete(url, user=user)
            for (site, who, tid) in deletes:
                if 'MSS' in site: continue  ### ever
                print site, who, tid
                print "approving deletion"
                print approveSubscription(
                    url,
                    tid,
                    nodes=[site],
                    comments='Production cleaning by data ops')
        return

    sites_and_datasets = defaultdict(list)
    our_copies = defaultdict(list)
    wf_cleaned = {}

    wfs = []
    for fetch in options.fetch.split(','):
        wfs.extend(
            session.query(Workflow).filter(Workflow.status == fetch).all())

    random.shuffle(wfs)
    last_answer = None
    for wfo in wfs:
        if options.number and len(wf_cleaned) >= options.number:
            print "Reached", options.number, "cleaned"
            break
        print '-' * 100
        wfi = workflowInfo(url, wfo.name)
        goes = {}  # boolean per output
        for dataset in wfi.request['OutputDatasets']:
            goes[dataset] = False
            keep_one_out = True
            status = getDatasetStatus(dataset)
            print "\n\tLooking at", dataset, status, "\n"
            vetoes = None
            if status == 'INVALID':
                vetoes = ['Export', 'Buffer']  ## can take themselves out
                keep_one_out = False  # just wipe clean

            elif status == None:
                print dataset, "actually does not exist. skip"
                goes[dataset] = True
                continue

            elif status in ['PRODUCTION', 'VALID'
                            ] and wfo.status in ['forget', 'trouble']:
                print dataset, "should probably be invalidated. (", wfo.status, ") skip"
                keep_one_out = False  # just wipe clean
                continue  ## you are not sure. just skip it for the time being

            elif status == 'PRODUCTION' and wfo.status in ['clean']:
                print dataset, "should probably be set valid .skip"
                continue  ## you are not sure. just skip it for the time being

            if status == 'VALID' and dataset.startswith('/MinBias'):
                print "This is a /MinBias. skip"
                continue

            if '/DQM' in dataset:
                keep_one_out = False

            total_size = getDatasetSize(dataset)

            our_presence = getDatasetPresence(url,
                                              dataset,
                                              complete=None,
                                              group="DataOps",
                                              vetoes=vetoes)
            also_our_presence = getDatasetPresence(url,
                                                   dataset,
                                                   complete=None,
                                                   group="",
                                                   vetoes=vetoes)

            ## merge in one unique dict
            for site in also_our_presence:
                if site in our_presence:
                    there, frac = our_presence[site]
                    other, ofrac = also_our_presence[site]
                    our_presence[site] = (max(there, other), max(frac, ofrac))
                else:
                    our_presence[site] = also_our_presence[site]

            if our_presence: print our_presence

            ## analysis ops copies need to be taken into account
            anaops_presence = getDatasetPresence(url,
                                                 dataset,
                                                 complete=None,
                                                 group="AnalysisOps")
            own_by_anaops = anaops_presence.keys()

            ## all our copies
            to_be_cleaned = our_presence.keys()
            if not len(to_be_cleaned):
                print "nowhere to be found of ours,", len(
                    own_by_anaops), "in analysi ops pool"
                goes[dataset] = True
                continue

            print "Where we own bits of dataset"
            print to_be_cleaned

            if len(own_by_anaops):
                ## remove site with the anaops copies
                to_be_cleaned = list(set(to_be_cleaned) - set(own_by_anaops))
                keep_one_out = False  ## in that case, just remove our copies
                print "Own by anaops (therefore not keep a copy of ours)"
                print own_by_anaops
            else:
                ## we should not be looking at anything that was not passed to DDM, otherwise we'll be cutting the grass under our feet
                using_the_same = getWorkflowByInput(url, dataset, details=True)
                conflict = False
                for other in using_the_same:
                    if other['RequestName'] == wfo.name: continue
                    if other['RequestType'] == 'Resubmission': continue
                    if not other['RequestStatus'] in [
                            'announced', 'normal-archived', 'aborted',
                            'rejected', 'aborted-archived',
                            'rejected-archived', 'closed-out', 'None', None
                    ]:
                        print other['RequestName'], 'is in status', other[
                            'RequestStatus'], 'preventing from cleaning', dataset
                        conflict = True
                        break
                if conflict:
                    continue

                ## not being used. a bit less dangerous to clean-out
                ## keep one full copy out there
                full_copies = [
                    site for (site, (there, fract)) in our_presence.items()
                    if there
                ]
                if keep_one_out:
                    if not len(full_copies):
                        print "we do not own a full copy of", dataset, status, wfo.status, ".skip"
                        continue
                    stay_there = random.choice(
                        full_copies)  #at a place own by ops
                    print "Where we keep a full copy", stay_there
                    to_be_cleaned.remove(stay_there)
                    our_copies[stay_there].append(dataset)
                else:
                    print "We do not want to keep a copy of ", dataset, status, wfo.status

            if len(to_be_cleaned):
                print "Where we can clean"
                print to_be_cleaned
                for site in to_be_cleaned:
                    sites_and_datasets[site].append(
                        (dataset, total_size * our_presence[site][1] / 100.,
                         status))
                goes[dataset] = True
            else:
                print "no cleaning to be done"
                goes[dataset] = True

        print wfo.name, "scrutinized"
        if all(goes.values()):
            print "\t", wfo.name, "can toggle -out"

        def ask():
            global last_answer
            last_answer = raw_input('go on ?')
            return last_answer

        if options.auto or ask() in ['y', '']:
            if all(goes.values()):
                wfo.status = wfo.status + '-out'
                wf_cleaned[wfo.name] = wfo.status
            continue
        elif last_answer in ['q', 'n']:
            break
        else:
            return

    if options.auto:
        pass
    elif last_answer in ['q']:
        return

    print "Potential cleanups"
    for (site, items) in sites_and_datasets.items():
        cleanup = sum([size for (_, size, _) in items])
        print "\n\t potential cleanup of", "%8.4f" % cleanup, "GB at ", site
        print "\n".join([ds + " " + st for ds, _, st in items])
        datasets = [ds for ds, _, st in items]

    print "Copies and bits we are going to delete"
    print json.dumps(sites_and_datasets, indent=2)

    print "Copies we are keeping"
    print json.dumps(our_copies, indent=2)

    print "Workflows cleaned for output"
    print json.dumps(wf_cleaned, indent=2)
    stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    open('outcleaning_%s.json' % stamp,
         'w').write(json.dumps(sites_and_datasets, indent=2))
    open('keepcopies_%s.json' % stamp,
         'w').write(json.dumps(our_copies, indent=2))
    open('wfcleanout_%s.json' % stamp,
         'w').write(json.dumps(wf_cleaned, indent=2))

    if (not options.test) and (options.auto or raw_input(
            "Satisfied ? (y will trigger status change and deletion requests)")
                               in ['y']):
        for (site, items) in sites_and_datasets.items():
            datasets = [ds for ds, _, st in items]
            print "making deletion to", site
            result = makeDeleteRequest(
                url, site, datasets,
                "Cleanup output after production. DataOps will take care of approving it."
            )
            print result
            ## approve it right away ?
            if 'MSS' in site: continue
            if 'Export' in site: continue
            if 'Buffer' in site: continue
            for did in [
                    item['id'] for item in result['phedex']['request_created']
            ]:
                print "auto-approve disabled, but ready"
                #approveSubscription(url, did, nodes = [site], comments = 'Auto-approving production cleaning deletion')
                pass
        session.commit()
    else:
        print "Not making the deletion and changing statuses"
Example #8
0
            else:
                remainings[site][dataset]["reasons"].append('lock')
            if dataset in waiting:
                #print dataset,"is waiting for custodial"
                sum_waiting+=size
                remainings[site][dataset]["reasons"].append('tape')

            if dataset in stuck:
                sum_stuck+=size
                remainings[site][dataset]["reasons"].append('stuck-tape')
            if dataset in missing:
                sum_missing +=size
                remainings[site][dataset]["reasons"].append('missing-tape')

            statuses = ['assignment-approved','acquired','running-open','running-closed','completed','closed-out']
            actors = getWorkflowByInput( url, dataset , details=True)
            #actors = [wfc['doc'] for wfc in by_input if wfc['key']==dataset]
            using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
            if len(using_actors):remainings[site][dataset]["reasons"].append('input')
            actors = getWorkflowByOutput( url, dataset , details=True)
            #actors = [wfc['doc'] for wfc in by_output if wfc['key']==dataset]
            using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
            if len(using_actors):remainings[site][dataset]["reasons"].append('output')
            actors = getWorkflowByMCPileup( url, dataset , details=True)
            #actors = [wfc['doc'] for wfc in by_pileup if wfc['key']==dataset]
            using_actors = [actor for actor in actors if actor['RequestStatus'] in statuses]
            if len(using_actors):remainings[site][dataset]["reasons"].append('pilup')
        
            print dataset,remainings[site][dataset]["reasons"]

        #print "\t",sum_waiting,"[GB] could be freed by custodial"
Example #9
0
def stagor(url,specific =None, options=None):
    
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost_blocks = json.loads(open('%s/lost_blocks_datasets.json'%monitor_dir).read())
    lost_files = json.loads(open('%s/lost_files_datasets.json'%monitor_dir).read())
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys()+lost_files.keys()):
        b,f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset,"has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f: 
            print dataset,"has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    try:
        cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read())
    except:
        print "inexisting transfer statuses. starting fresh"
        cached_transfer_statuses = {}

    transfer_statuses = {}

    ## pop all that are now in negative values
    for phedexid in cached_transfer_statuses.keys():
        transfers = session.query(Transfer).filter(Transfer.phedexid==int(phedexid)).all()
        if not transfers:
            print phedexid,"does not look relevant to be in cache anymore. poping"
            print cached_transfer_statuses.pop( phedexid )


            
    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']:
            wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus'])
            wfi.status='away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            print wfo.name,"is",wfi.request['RequestStatus']
            sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
        wfois.append( (wfo,wfi) )            
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            needs[wfo.name].append( dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset))

    open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2))

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()
    ## phedexid are set negative when not relevant anymore
    # probably there is a db schema that would allow much faster and simpler query
    for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all():
        if specific  and str(transfer.phedexid)!=str(specific): continue

        skip=True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf: 
                if tr_wf.status == 'staging':
                    sendLog('stagor',"\t%s is staging for %s"%(transfer.phedexid, tr_wf.name))
                    skip=False

        if skip: 
            sendLog('stagor',"setting %s to negative value"%transfer.phedexid)
            transfer.phedexid = -transfer.phedexid
            session.commit()
            continue
        if transfer.phedexid<0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url,  transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            sendLog('stagor', "%s is not yet approved"%transfer.phedexid)
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        if str(transfer.phedexid) in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor',"read %s from cache"%transfer.phedexid)
            checks = cached_transfer_statuses[str(transfer.phedexid)]
        else:
            checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)
        ## just write this out
        transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][transfer.phedexid]=checks[dsname].values()
        if checks:
            sendLog('stagor',"Checks for %s are %s"%( transfer.phedexid, [node.values() for node in checks.values()]))
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the scubscriptions API of ",transfer.phedexid
            print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][transfer.phedexid]=done
            ## for those that are in staging, and the destination site is in drain
            #if not done and tr_wf.status == 'staging':
                

        for ds in checks:
            for s,v in checks[ds].items():
                dataset_endpoints[ds].add( s )

        if done:
            ## transfer.status = 'done'
            sendLog('stagor',"%s is done"%transfer.phedexid)
            cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks)
        else:
            sendLog('stagor',"%s is not finished %s"%(transfer.phedexid, pprint.pformat( checks )))
            pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough]
                sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready]
                if sites_incomplete_down:
                    sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds))
                #sites_complete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v>=good_enough]
                #endpoint[ds].update( sites_complete )
                #endpoint[ds].update( sites_incomplete )
                #endpoint_completed[ds].update( sites_complete )
                endpoint_incompleted[ds].update( sites_incomplete )
                endpoint_in_downtime[ds].update( sites_incomplete_down )
            


    print "End point in down time"
    for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps( endpoint_in_downtime , indent=2)


    open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2))
    open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    open('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2))

    already_stuck = json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )
    missing_in_action = defaultdict(list)


    print "-"*10,"Checking on workflows in staging","-"*10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda : defaultdict(float))
    presence_cache = defaultdict(dict)
    for wfo,wfi in wfois:
        print "#"*30
        ## the site white list takes site, campaign, memory and core information
        (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys={}
        for need in list(primaries)+list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor',"missing transfer report for %s"%need)
                readys[need] = False      
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report','%s does not have a transfer report'%(need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need)
                readys[need] = True
                done_by_input[need] = { "fake" : True }
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor',"%s is ready"%need)
                print json.dumps( done_by_input[need] , indent=2)
                readys[need] = True
            else:
                wfi.sendLog('stagor',"%s is not ready"%need)
                print json.dumps( done_by_input[need] , indent=2)
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',"all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor',"all needs are fullfilled, already")
                print json.dumps( readys, indent=2 )
        else:
            wfi.sendLog('stagor',"missing requirements")
            copies_needed,_ = wfi.getNCopies()
            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):#+list(secondaries):
                if endpoint_in_downtime[need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime"%need)
                    re_transfer=True

                if not se_allowed_key in available_cache[need]:
                    available_cache[need][se_allowed_key]  = getDatasetBlocksFraction( url , need, sites=se_allowed )
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need)
                        jump_ahead = True

            ## compute a time since staging to filter jump starting ?                    
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries)+list(secondaries):
                if need in already_stuck: 
                    wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need)
                    jump_ahead = True
                    
            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead"%wfo.name
                details_text += '\n%s wants %s copies'%(wfo.name,copies_needed)
                copies_needed = max(1,copies_needed-1)
                details_text += '\nlowering by one unit to %s'%copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True
                
                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update( presence.keys() )
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog('stagor', "%s is available %s times %s"%( need, available, this_check))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secon
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog('stagor',"%s is all transfered %s"%(need, json.dumps(done_by_input[need], indent=2)))
                    all_check&= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check:    
                    wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name,"has to wait a bit more"
                    wfi.sendLog('stagor',"needs to wait a bit more")
            else:
                wfi.sendLog('stagor',"not checking availability")
            if re_transfer:
                wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime")
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add( wfo.name )



    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical')

    print "-"*10,"Checking on non-available datasets","-"*10    
    ## now check on those that are not fully available
    
    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min( available_cache[dsname].values() )
            
    for dsname,available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )

        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical')
                    else:
                        print "would send",wf.name,"back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete",dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks,lost_files = findLostBlocksFiles( url, dsname )
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names)
                print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss)
                if fraction_loss > 0.05: ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='warning')
                    ## the workflow should be rejected !
                    for wf in using_wfos: 
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            #sendEmail('doomed workflow','%s has too much loss on the input dataset %s. please check on stagor logs https://cmst2.web.cern.ch/cmst2/unified/logs/stagor/last.log'%(wf.name, dsname))
                            sendLog('stagor', '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog('stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ), level='critical')
                        known_lost_blocks[dsname] = [i['name'] for i in lost_blocks]
                                  
            if lost_files:
                fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names)
                print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss
                
                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss), level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove    
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical')
                        known_lost_files[dsname] = [i['name'] for i in lost_files]

                ## should the status be change to held-staging and pending on a ticket



            missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] 
            print "\t",done_by_input[dsname]
            print "\tneeds",len(done_by_input[dsname])
            print "\tgot",done_by_input[dsname].values().count(True)
            print "\tmissing",missings
            missing_in_action[dsname].extend( missings )
        


    rr= open('%s/lost_blocks_datasets.json'%monitor_dir,'w')
    rr.write( json.dumps( known_lost_blocks, indent=2))
    rr.close()

    rr= open('%s/lost_files_datasets.json'%monitor_dir,'w')
    rr.write( json.dumps( known_lost_files, indent=2))
    rr.close()


    open('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) )
    print "Stuck transfers and datasets"
    print json.dumps( missing_in_action, indent=2 )

    print "Going further and make a report of stuck transfers"

    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add( dataset )

    bad_destinations = defaultdict(set)
    bad_sources = defaultdict(set)
    report = ""
    really_stuck_dataset = set()
    transfer_timeout = UC.get("transfer_timeout")
    transfer_lowrate = UC.get("transfer_lowrate")
    for phid,datasets in datasets_by_phid.items():
        issues = checkTransferLag( url, phid , datasets=list(datasets) )
        for dataset in issues:
            for block in issues[dataset]:
                for destination in issues[dataset][block]:
                    (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination]
                    ## count x_Buffer and x_MSS as one source
                    redones=[]
                    for d in dones:
                        if d.endswith('Buffer') or d.endswith('Export'):
                            if d.replace('Buffer','MSS').replace('Export','MSS') in dones: 
                                continue
                            else: 
                                redones.append( d )
                        else:
                            redones.append( d )
                    dones = list(set( redones ))
                    #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones)
                    if delay>transfer_timeout and rate<transfer_lowrate:
                        if len(dones)>1:
                            ## its the destination that sucks
                            bad_destinations[destination].add( block )
                        else:
                            dum=[bad_sources[d].add( block ) for d in dones]
                        really_stuck_dataset.add( dataset )
                        print "add",dataset,"to really stuck"
                        report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay)
    print "\n"*2

    ## create tickets right away ?
    report+="\nbad sources "+",".join(bad_sources.keys())+"\n"
    for site,blocks in bad_sources.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))
    report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n"
    for site,blocks in bad_destinations.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))

    print '\n'*2,"Datasets really stuck"
    print '\n'.join( really_stuck_dataset )

    print '\n'*2,"report written at https://cmst2.web.cern.ch/cmst2/unified/logs/incomplete_transfers.log"
    print report

    stuck_transfers = dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset])
    print '\n'*2,'Stuck dataset transfers'
    print json.dumps(stuck_transfers , indent=2)
    open('%s/stuck_transfers.json'%monitor_dir,'w').write( json.dumps(stuck_transfers , indent=2) )
    open('%s/logs/incomplete_transfers.log'%monitor_dir,'w').write( report )
Example #10
0
def stagor(url, specific=None):
    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        ## implement the grace period for by-passing the transfer.
        pass

    for transfer in session.query(Transfer).all():
        if specific and str(transfer.phedexid) != str(specific): continue

        skip = True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:
                if tr_wf.status == 'staging':
                    skip = False
                    break

        if skip: continue
        if transfer.phedexid < 0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url, transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            print transfer.phedexid, "is not yet approved"
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname] = {}
                if not dsname in completion_by_input:
                    completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid] = all(
                    map(lambda i: i >= good_enough, checks[dsname].values()))
                completion_by_input[dsname][
                    transfer.phedexid] = checks[dsname].values()
        if checks:
            print "Checks for", transfer.phedexid, [
                node.values() for node in checks.values()
            ]
            done = all(
                map(
                    lambda i: i >= good_enough,
                    list(
                        itertools.chain.from_iterable(
                            [node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the scubscriptions API of ", transfer.phedexid
            print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:  # and tr_wf.status == 'staging':
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id] = {}
                done_by_wf_id[tr_wf.id][transfer.phedexid] = done

        if done:
            ## transfer.status = 'done'
            print transfer.phedexid, "is done"
        else:
            print transfer.phedexid, "not finished"
            pprint.pprint(checks)

    #print done_by_input
    print "\n----\n"
    for dsname in done_by_input:
        fractions = None
        if dsname in completion_by_input:
            fractions = itertools.chain.from_iterable(
                [check.values() for check in completion_by_input.values()])

        ## the workflows in the waiting room for the dataset
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(
                Workflow.name == using_it).first()
            if wf:
                using_wfos.append(wf)

        #need_sites = int(len(done_by_input[dsname].values())*0.7)+1
        need_sites = len(done_by_input[dsname].values())
        if need_sites > 10:
            need_sites = int(need_sites / 2.)
        got = done_by_input[dsname].values().count(True)
        if all([wf.status != 'staging' for wf in using_wfos]):
            ## not a single ds-using wf is in staging => moved on already
            ## just forget about it
            print "presence of", dsname, "does not matter anymore"
            print "\t", done_by_input[dsname]
            print "\t", [wf.status for wf in using_wfos]
            print "\tneeds", need_sites
            continue  #??

        ## should the need_sites reduces with time ?
        # with dataset choping, reducing that number might work as a block black-list.

        if all(done_by_input[dsname].values()):
            print dsname, "is everywhere we wanted"
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name, "is with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif fractions and len(list(fractions)) > 1 and set(fractions) == 1:
            print dsname, "is everywhere at the same fraction"
            print "We do not want this in the end. we want the data we asked for"
            continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name, "is with us everywhere the same. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif got >= need_sites:
            print dsname, "is almost everywhere we wanted"
            #print "We do not want this in the end. we want the data we asked for"
            #continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name, "is almost with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        else:
            print dsname
            print "\t", done_by_input[dsname]
            print "\tneeds", need_sites
            print "\tgot", got

    for wfid in done_by_wf_id:
        #print done_by_wf_id[wfid].values()
        ## ask that all related transfer get into a valid state
        if all(done_by_wf_id[wfid].values()):
            pass
Example #11
0
def stagor(url,specific =None):
    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        ## implement the grace period for by-passing the transfer.
        pass

    for transfer in session.query(Transfer).all():
        if specific  and str(transfer.phedexid)!=str(specific): continue

        skip=True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf: 
                if tr_wf.status == 'staging':
                    skip=False
                    break

        if skip: continue
        if transfer.phedexid<0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url,  transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            print transfer.phedexid,"is not yet approved"
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][transfer.phedexid]=checks[dsname].values()
        if checks:
            print "Checks for",transfer.phedexid,[node.values() for node in checks.values()]
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the scubscriptions API of ",transfer.phedexid
            print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][transfer.phedexid]=done


        if done:
            ## transfer.status = 'done'
            print transfer.phedexid,"is done"
        else:
            print transfer.phedexid,"not finished"
            pprint.pprint( checks )

    #print done_by_input
    print "\n----\n"
    for dsname in done_by_input:
        fractions = None
        if dsname in completion_by_input:
            fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()])
        
        ## the workflows in the waiting room for the dataset
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )

        #need_sites = int(len(done_by_input[dsname].values())*0.7)+1
        need_sites = len(done_by_input[dsname].values())
        if need_sites > 10:
            need_sites = int(need_sites/2.)
        got = done_by_input[dsname].values().count(True)
        if all([wf.status != 'staging' for wf in using_wfos]):
            ## not a single ds-using wf is in staging => moved on already
            ## just forget about it
            print "presence of",dsname,"does not matter anymore"
            print "\t",done_by_input[dsname]
            print "\t",[wf.status for wf in using_wfos]
            print "\tneeds",need_sites
            continue #??
            
        ## should the need_sites reduces with time ?
        # with dataset choping, reducing that number might work as a block black-list.

        if all(done_by_input[dsname].values()):
            print dsname,"is everywhere we wanted"
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif fractions and len(list(fractions))>1 and set(fractions)==1:
            print dsname,"is everywhere at the same fraction"
            print "We do not want this in the end. we want the data we asked for"
            continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us everywhere the same. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif got >= need_sites:
            print dsname,"is almost everywhere we wanted"
            #print "We do not want this in the end. we want the data we asked for"
            #continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is almost with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        else:
            print dsname
            print "\t",done_by_input[dsname]
            print "\tneeds",need_sites
            print "\tgot",got

    for wfid in done_by_wf_id:
        #print done_by_wf_id[wfid].values()
        ## ask that all related transfer get into a valid state
        if all(done_by_wf_id[wfid].values()):
            pass
Example #12
0
def stagor(url,specific =None, options=None):
    
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost_blocks = json.loads(open('%s/lost_blocks_datasets.json'%monitor_dir).read())
    lost_files = json.loads(open('%s/lost_files_datasets.json'%monitor_dir).read())
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys()+lost_files.keys()):
        b,f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset,"has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f: 
            print dataset,"has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    try:
        cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read())
    except:
        print "inexisting transfer statuses. starting fresh"
        cached_transfer_statuses = {}
        return False

    transfer_statuses = {}


    def time_point(label="",sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s"%(label, nows)
        print "Since start: %s [s]"% ( now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) 
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]"% ( now - time_point.lap ) 
            time_point.lap = now            
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())


    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    needs_by_priority = defaultdict(list)
    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']:
            wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus'])
            wfi.status='away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            print wfo.name,"is",wfi.request['RequestStatus']
            sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
        wfois.append( (wfo,wfi) )            
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            needs[wfo.name].append( dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append( dataset )
            wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset))

    time_point("Check staging workflows")            

    open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2))
    for prio in needs_by_priority: needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    open('%s/dataset_priorities.json'%monitor_dir,'w').write( json.dumps( needs_by_priority , indent=2))
        

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()


    ## first check if anything is inactive
    all_actives = set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()])
    for active_phedexid in all_actives:
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip =False
                sendLog('stagor',"\t%s is staging for %s"%(imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor',"setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()]))
    for phedexid in all_actives:

        if specific: continue

        ## check on transfer completion
        not_cached = False
        if str(phedexid) in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor',"read %s from cache"%phedexid)
            checks = cached_transfer_statuses[str(phedexid)]
        else:
            ## I actually would like to avoid that all I can
            sendLog('stagor','Performing spurious transfer check on %s'% phedexid, level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            if not checks:
                ## this is going to bias quite heavily the rest of the code. we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog('stagor','Stagor has got a skewed input from checkTransferStatus', level='critical')
                checks = {}
                pass
            #checks = {}
            #not_cached = True

        time_point("Check transfer status %s"% phedexid, sub_lap=True)            

        ## just write this out
        transfer_statuses[str(phedexid)] = copy.deepcopy(checks)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][phedexid]=checks[dsname].values()
        if checks:
            sendLog('stagor',"Checks for %s are %s"%( phedexid, [node.values() for node in checks.values()]))
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ",phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][phedexid]=done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s,v in checks[ds].items():
                dataset_endpoints[ds].add( s )

        if done:
            sendLog('stagor',"%s is done"%phedexid)
            cached_transfer_statuses[str(phedexid)] = copy.deepcopy(checks)
        else:
            sendLog('stagor',"%s is not finished %s"%(phedexid, pprint.pformat( checks )))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough]
                sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]



                if sites_incomplete_down:
                    sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds))
                #sites_complete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v>=good_enough]
                #endpoint[ds].update( sites_complete )
                #endpoint[ds].update( sites_incomplete )
                #endpoint_completed[ds].update( sites_complete )
                endpoint_incompleted[ds].update( sites_incomplete )
                endpoint_in_downtime[ds].update( sites_incomplete_down )
            

    time_point("Check on-going transfers")            


    print "End points"
    for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps( dataset_endpoints , indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps( endpoint_in_downtime , indent=2)    

    print "End point incomplete in down time"
    for k in endpoint_incompleted: endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps( endpoint_incompleted , indent=2)        


    #open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2))
    open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    open('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2))

    already_stuck = json.loads( open('%s/stuck_transfers.json'%monitor_pub_dir).read() ).keys()
    already_stuck.extend( getAllStuckDataset() )
 
    missing_in_action = defaultdict(list)


    print "-"*10,"Checking on workflows in staging","-"*10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda : defaultdict(float))
    presence_cache = defaultdict(dict)

    time_point("Preparing for more")
    for wfo,wfi in wfois:
        print "#"*30
        time_point("Forward checking %s"% wfo.name,sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys={}
        for need in list(primaries)+list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor',"missing transfer report for %s"%need)
                readys[need] = False      
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report','%s does not have a transfer report'%(need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need)
                readys[need] = True
                done_by_input[need] = { "fake" : True }
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor',"%s is ready"%need)
                print json.dumps( done_by_input[need] , indent=2)
                readys[need] = True
            else:
                wfi.sendLog('stagor',"%s is not ready \n%s"%(need,json.dumps( done_by_input[need] , indent=2)))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',"all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor',"all needs are fullfilled, already")
                print json.dumps( readys, indent=2 )
        else:
            wfi.sendLog('stagor',"missing requirements")
            copies_needed,_ = wfi.getNCopies()
            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):#+list(secondaries):
                if endpoint_in_downtime[need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime"%need)
                    re_transfer=True
                
                if not se_allowed_key in available_cache[need]:
                    available_cache[need][se_allowed_key]  = getDatasetBlocksFraction( url , need, sites=se_allowed )
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need)
                        jump_ahead = True
                    else:
                        wfi.sendLog('stagor',"Available %s times"% available_cache[need][se_allowed_key])
                        missing_and_downtime = list(set(endpoint_in_downtime[need]) & set(endpoint_incompleted[need]))
                        if missing_and_downtime:
                            wfi.sendLog('stagor',"%s is incomplete at %s which is in downtime, trying to move along"%(need, ','.join(missing_and_downtime)))
                            jump_ahead = True
                        else:
                            wfi.sendLog('stagor',"continue waiting for transfers for optimum production performance.")



            ## compute a time since staging to filter jump starting ?                    
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries)+list(secondaries):
                if need in already_stuck: 
                    wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need)
                    jump_ahead = True
                    
            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead"%wfo.name
                details_text += '\n%s wants %s copies'%(wfo.name,copies_needed)
                copies_needed = max(1,copies_needed-1)
                details_text += '\nlowering by one unit to %s'%copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True
                
                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update( presence.keys() )
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog('stagor', "%s is available %s times (%s), at %s"%( need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secon
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog('stagor',"%s is this much transfered %s"%(need, json.dumps(done_by_input[need], indent=2)))
                    all_check&= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:    
                    wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name,"has to wait a bit more"
                    wfi.sendLog('stagor',"needs to wait a bit more")
            else:
                wfi.sendLog('stagor',"not checking availability")

            if re_transfer:
                wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime")
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add( wfo.name )


    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical')

    print "-"*10,"Checking on non-available datasets","-"*10    
    ## now check on those that are not fully available
    
    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min( available_cache[dsname].values() )

    really_stuck_dataset = set()

    for dsname,available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )

        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical')
                    else:
                        print "would send",wf.name,"back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete",dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks,lost_files = findLostBlocksFiles( url, dsname ) if (not dsname.endswith('/RAW')) else ([],[])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names)
                print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss)
                if fraction_loss > 0.05: ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos: 
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog('stagor', '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog('stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ), level='critical')
                        known_lost_blocks[dsname] = [i['name'] for i in lost_blocks]
                really_stuck_dataset.add( dsname )
                  
            if lost_files:
                fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names)
                print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss
                
                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss), level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove    
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical')
                        known_lost_files[dsname] = [i['name'] for i in lost_files]

                ## should the status be change to held-staging and pending on a ticket



            missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] 
            print "\t",done_by_input[dsname]
            print "\tneeds",len(done_by_input[dsname])
            print "\tgot",done_by_input[dsname].values().count(True)
            print "\tmissing",missings
            missing_in_action[dsname].extend( missings )
        


    rr= open('%s/lost_blocks_datasets.json'%monitor_dir,'w')
    rr.write( json.dumps( known_lost_blocks, indent=2))
    rr.close()

    rr= open('%s/lost_files_datasets.json'%monitor_dir,'w')
    rr.write( json.dumps( known_lost_files, indent=2))
    rr.close()


    open('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) )
    print "Stuck transfers and datasets"
    print json.dumps( missing_in_action, indent=2 )


    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add( dataset )

    for k in datasets_by_phid:
        datasets_by_phid[k] = list(datasets_by_phid[k])

    open('datasets_by_phid.json','w').write( json.dumps(datasets_by_phid, indent=2 ))

    open('really_stuck_dataset.json','w').write( json.dumps(list(really_stuck_dataset), indent=2 ))
    print '\n'*2,"Datasets really stuck"
    print '\n'.join( really_stuck_dataset )

    #############
    ## not going further for what matters
    #############
    return 
Example #13
0
def cleanor(url, specific=None):
    print "Deprecated"
    return

    if duplicateLock() : return 

    delete_per_site = {}
    do_not_autoapprove = []#'T2_FR_CCIN2P3']
    SI = siteInfo()
    CI = campaignInfo()
    LI = lockInfo()

    counts=0
    for wfo in session.query(Workflow).filter(Workflow.status == 'done').all():
        keep_a_copy = False
        if specific and not specific in wfo.name: continue
        ## what was in input 
        wl = getWorkLoad(url,  wfo.name )

        if 'Campaign' in wl and wl['Campaign'] in CI.campaigns and 'clean-in' in CI.campaigns[wl['Campaign']] and CI.campaigns[wl['Campaign']]['clean-in']==False:
            print "Skipping cleaning on input for campaign",wl['Campaign'], "as per campaign configuration"
            continue

        dataset= 'N/A'
        if 'InputDataset' in wl:
            dataset = wl['InputDataset']

        print dataset,"in input"
        #print json.dumps(wl, indent=2)
        announced_log = filter(lambda change : change["Status"] in ["closed-out","normal-archived","announced"],wl['RequestTransition'])
        if not announced_log: 
            print "Cannot figure out when",wfo.name,"was finished"
            continue
        now = time.mktime(time.gmtime()) / (60*60*24.)
        then = announced_log[-1]['UpdateTime'] / (60.*60.*24.)
        if (now-then) <2:
            print "workflow",wfo.name, "finished",now-then,"days ago. Too fresh to clean"
            continue
        else:
            print "workflow",wfo.name,"has finished",now-then,"days ago."

        if not 'InputDataset' in wl: 
            ## should we set status = clean ? or something even further
            print "passing along",wfo.name,"with no input"
            wfo.status = 'clean'
            session.commit()
            continue

        if 'MinBias' in dataset:
            print "Should not clean anything using",dataset,"setting status further"
            wfo.status = 'clean'
            session.commit()
            continue

        total_size = getDatasetSize( dataset ) ## in Gb        
        #if counts> 20:            break
        counts+=1
        ## find any location it is at
        our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps")
        also_our_presence = getDatasetPresence(url, dataset, complete=None, group="")

        ## is there a custodial !!!
        custodials = findCustodialLocation(url, dataset)
        if not len(custodials):
            print dataset,"has no custodial site yet, excluding from cleaning"
            continue

        ## find out whether it is still in use
        using_the_same = getWorkflowByInput(url, dataset, details=True)
        conflict=False
        for other in using_the_same:
            if other['RequestName'] == wfo.name: continue
            if other['RequestType'] == 'Resubmission': continue
            if not other['RequestStatus'] in ['announced','normal-archived','aborted','rejected','aborted-archived','aborted-completed','rejected-archived','closed-out','None',None,'new']:
                print other['RequestName'],'is in status',other['RequestStatus'],'preventing from cleaning',dataset
                conflict=True
                break
            if 'Campaign' in other and other['Campaign'] in CI.campaigns and 'clean-in' in CI.campaigns[other['Campaign']] and CI.campaigns[other['Campaign']]['clean-in']==False:
                print other['RequestName'],'is in campaign',other['Campaign']
                conflict = True
                break
        if conflict: continue
        print "other statuses:",[other['RequestStatus'] for other in using_the_same if other['RequestName'] != wfo.name]


        ## find all disks
        to_be_cleaned = filter(lambda site : site.startswith('T2') or site.endswith('Disk') ,our_presence.keys())
        to_be_cleaned.extend( filter(lambda site : site.startswith('T2') or site.endswith('Disk') ,also_our_presence.keys()))
        print to_be_cleaned,"for",total_size,"GB"

        anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps")
        own_by_anaops = anaops_presence.keys()
        print "Own by analysis ops and vetoing"
        print own_by_anaops
        ## need to black list the sites where there is a copy of analysis ops
        to_be_cleaned = [site for site in to_be_cleaned if not site in own_by_anaops ]

        ## keep one copy out there
        if 'Campaign' in wl and wl['Campaign'] in CI.campaigns and 'keep-one' in CI.campaigns[wl['Campaign']] and CI.campaigns[wl['Campaign']]['keep-one']==True:
            print "Keeping a copy of input for",wl['Campaign']
            keep_a_copy = True
            
        if keep_a_copy:
            keep_at = None
            full_copies = [site for (site,(there,_)) in our_presence.items() if there and site.startswith('T1')]
            full_copies.extend( [site for (site,(there,_)) in also_our_presence.items() if there and site.startswith('T1')] )
            if not full_copies:
                full_copies = [site for (site,(there,_)) in our_presence.items() if there and site.startswith('T2')]
                full_copies.extend( [site for (site,(there,_)) in also_our_presence.items() if there and site.startswith('T2')] )

            if full_copies:
                keep_at = random.choice( full_copies )
                
            if not keep_at:
                print "We are enable to find a place to keep a full copy of",dataset,"skipping"
                continue
            else:
                ## keeping that copy !
                print "Keeping a full copy of",dataset,"at",keep_at,"not setting the status further"
                to_be_cleaned.remove( keep_at )
        else:
            wfo.status = 'clean'

        ## collect delete request per site
        for site in to_be_cleaned :
            if not site in delete_per_site: delete_per_site[site] = []
            if not dataset in [existing[0] for existing in delete_per_site[site]]:
                delete_per_site[site].append( (dataset, total_size) )
        
        session.commit()

    #open('deletes.json','w').write( json.dumps(delete_per_site,indent=2) )

    print json.dumps(delete_per_site, indent=2)
    print "\n\n ------- \n\n"
    ## unroll the deletion per site
    ## maybe find the optimum site/dataset dataset/site to limit the number of ph requests
    for site in delete_per_site:
        dataset_list = [info[0] for info in delete_per_site[site]]
        size_removal = sum([info[1] for info in delete_per_site[site]]) / 1024.
        if site in SI.disk:
            free = SI.disk[site]
            print site,"has",size_removal,"TB of potential cleanup.",free,"TB available."
        else:
            print site,"has",size_removal,"TB of potential cleanup. no info on available."

        print "\t",','.join(dataset_list)
    
    ## make deletion requests
    for site in delete_per_site:
        site_datasets = [info[0] for info in delete_per_site[site]]
        is_tape = any([v in site for v in ['MSS','Export','Buffer'] ])
        #comments="Cleanup input after production. DataOps will take care of approving it."
        #if is_tape:
        #    comments="Cleanup input after production."
        for item in site_datasets:
            LI.release( item, site, 'cleanup of input after production')
Example #14
0
def stagor(url,specific =None, options=None):
    
    if not componentInfo().check(): return
    SI = global_SI
    CI = campaignInfo()
    UC = unifiedConfiguration()

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost = json.loads(open('lost_blocks_datasets.json').read())
    still_lost = []
    for dataset in lost:
        l = findLostBlocks(url ,dataset)
        if not l:
            print dataset,"is not really lost"
        else:
            still_lost.append( dataset )
    open('lost_blocks_datasets.json','w').write( json.dumps( still_lost, indent=2) )

    cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read())

    if options.fast:
        print "doing the fast check of staged with threshold:",options.goodavailability
        for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
            if specific and not specific in wfo.name: continue
            wfi = workflowInfo(url, wfo.name)
            (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList()
            if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = CI.parameters(wfi.request['Campaign'])['SiteWhitelist']
            if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist']))
            se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] 
            all_check = True
            n_copies = wfi.getNCopies()
            for dataset in list(primaries):#+list(secondaries) ?
                #print se_allowed
                available = getDatasetBlocksFraction( url , dataset , sites=se_allowed )
                #all_check &= (available >= options.goodavailability)
                all_check &= (available >= n_copies)
                if not all_check: break

            if all_check:
                print "\t\t",wfo.name,"can go staged"
                wfo.status = 'staged'
                session.commit()
            else:
                print "\t",wfo.name,"can wait a bit more"
        return 

    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in ['running-open','running-closed','completed']:
            print wfo.name,"is",wfi.request['RequestStatus']
            wfi.status='away'
            session.commit()
            continue
            
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            print wfo.name,"needs",dataset

    ## this loop is very expensive and will not function at some point.
    ## transfer objects should probably be deleted as some point

    for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all():
        if specific  and str(transfer.phedexid)!=str(specific): continue

        skip=True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf: 
                if tr_wf.status == 'staging':
                    print "\t",transfer.phedexid,"is staging for",tr_wf.name
                    skip=False

        if skip: 
            print "setting",transfer.phedexid,"to negative value"
            transfer.phedexid = -transfer.phedexid
            session.commit()
            continue
        if transfer.phedexid<0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url,  transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            print transfer.phedexid,"is not yet approved"
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        if str(transfer.phedexid) in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            print "read",transfer.phedexid,"from cache"
            checks = cached_transfer_statuses[str(transfer.phedexid)]
        else:
            checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][transfer.phedexid]=checks[dsname].values()
        if checks:
            print "Checks for",transfer.phedexid,[node.values() for node in checks.values()]
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the scubscriptions API of ",transfer.phedexid
            print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][transfer.phedexid]=done
            ## for those that are in staging, and the destination site is in drain
            #if not done and tr_wf.status == 'staging':
                

        if done:
            ## transfer.status = 'done'
            print transfer.phedexid,"is done"
            cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks)
        else:
            print transfer.phedexid,"not finished"
            pprint.pprint( checks )

    open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2))

    missing_in_action = defaultdict(list)
    #print done_by_input
    print "\n----\n"
    for dsname in done_by_input:
        fractions = None
        if dsname in completion_by_input:
            fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()])
        
        ## the workflows in the waiting room for the dataset
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )
        
        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                    else:
                        print "would send",wf.name,"back to considered"
                        sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
            continue

        #need_sites = int(len(done_by_input[dsname].values())*0.7)+1
        need_sites = len(done_by_input[dsname].values())
        #if need_sites > 10:            need_sites = int(need_sites/2.)
        got = done_by_input[dsname].values().count(True)
        if all([wf.status != 'staging' for wf in using_wfos]):
            ## not a single ds-using wf is in staging => moved on already
            ## just forget about it
            #print "presence of",dsname,"does not matter anymore"
            #print "\t",done_by_input[dsname]
            #print "\t",[wf.status for wf in using_wfos]
            #print "\tneeds",need_sites
            continue
            
        ## should the need_sites reduces with time ?
        # with dataset choping, reducing that number might work as a block black-list.

        if len(done_by_input[dsname].values()) and all(done_by_input[dsname].values()):
            print dsname,"is everywhere we wanted"
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif fractions and len(list(fractions))>1 and set(fractions)==1:
            print dsname,"is everywhere at the same fraction"
            print "We do not want this in the end. we want the data we asked for"
            continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us everywhere the same. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif got >= need_sites:
            print dsname,"is almost everywhere we wanted"
            #print "We do not want this in the end. we want the data we asked for"
            #continue
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is almost with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        else:
            print "incomplete",dsname
            lost = findLostBlocks(url, dsname)
            lost_names = [item['name'] for item in lost]
            try:
                known_lost = json.loads(open('lost_blocks_datasets.json').read())
            except:
                print "enable to get the known_lost from local json file"
                known_lost = []

            if lost:
                print "We have lost",len(lost),"blocks",lost_names
                #print json.dumps( lost , indent=2 )

            if lost and not dsname in known_lost:
                ## make a deeper investigation of the block location to see whether it's really no-where no-where
                sendEmail('we have lost a few blocks', str(len(lost))+" in total.\nDetails \n:"+json.dumps( lost , indent=2 ))
                known_lost.append(dsname)
                rr= open('lost_blocks_datasets.json','w')
                rr.write( json.dumps( known_lost, indent=2))
                rr.close()
                ## should the status be change to held-staging and pending on a ticket



            missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] 
            print "\t",done_by_input[dsname]
            print "\tneeds",need_sites
            print "\tgot",got
            print "\tmissing",missings
            missing_in_action[dsname].extend( missings )

    open('/afs/cern.ch/user/c/cmst2/www/unified/incomplete_transfers.json','w').write( json.dumps(missing_in_action, indent=2) )
    print "Stuck transfers and datasets"
    print json.dumps( missing_in_action, indent=2 )

    print "Going further and make a report of stuck transfers"

    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add( dataset )

    bad_destinations = defaultdict(set)
    bad_sources = defaultdict(set)
    report = ""
    really_stuck_dataset = set()
    for phid,datasets in datasets_by_phid.items():
        issues = checkTransferLag( url, phid , datasets=list(datasets) )
        for dataset in issues:
            for block in issues[dataset]:
                for destination in issues[dataset][block]:
                    (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination]
                    ## count x_Buffer and x_MSS as one source
                    redones=[]
                    for d in dones:
                        if d.endswith('Buffer') or d.endswith('Export'):
                            if d.replace('Buffer','MSS').replace('Export','MSS') in dones: 
                                continue
                            else: 
                                redones.append( d )
                        else:
                            redones.append( d )
                    dones = list(set( redones ))
                    #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones)
                    if delay>7 and rate<0.0004:
                        if len(dones)>1:
                            ## its the destination that sucks
                            bad_destinations[destination].add( block )
                        else:
                            dum=[bad_sources[d].add( block ) for d in dones]
                        really_stuck_dataset.add( dataset )
                        report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay)
    print "\n"*2

    ## create tickets right away ?
    report+="\nbad sources "+",".join(bad_sources.keys())+"\n"
    for site,blocks in bad_sources.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))
    report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n"
    for site,blocks in bad_destinations.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))

    print report

    open('/afs/cern.ch/user/c/cmst2/www/unified/stuck_transfers.json','w').write( json.dumps(dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset]), indent=2) )
    open('/afs/cern.ch/user/c/cmst2/www/unified/logs/incomplete_transfers.log','w').write( report )
Example #15
0
def stagor(url,specific =None, good_enough = 99.9):
    done_by_wf_id = {}
    done_by_input = {}
    for transfer in session.query(Transfer).all():
        if specific  and str(transfer.phedexid)!=str(specific): continue

        skip=True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf: 
                if tr_wf.status == 'staging':
                    skip=False
                    break

        if skip: continue
        if transfer.phedexid<0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url,  transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            print transfer.phedexid,"is not yet approved"
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        checks = checkTransferStatus(url, transfer.phedexid)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>good_enough, checks[dsname].values()))
        if checks:
            done = all(map(lambda i:i>good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the scubscriptions API"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][transfer.phedexid]=done


        if done:
            ## transfer.status = 'done'
            print transfer.phedexid,"is done"
        else:
            print transfer.phedexid,"not finished"
            pprint.pprint( checks )

    #print done_by_input
    print "\n----\n"
    for dsname in done_by_input:
        if all(done_by_input[dsname].values()):
            print dsname,"is everywhere we wanted"
            ## the input dataset is fully transfered, should consider setting the corresponding wf to staged
            using_its = getWorkflowByInput(url, dsname)
            #print using_its
            for using_it in using_its:
                wf = session.query(Workflow).filter(Workflow.name == using_it).first()
                if wf and wf.status == 'staging':
                    print wf.name,"is with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        else:
            print dsname,done_by_input[dsname]

    for wfid in done_by_wf_id:
        #print done_by_wf_id[wfid].values()
        ## ask that all related transfer get into a valid state
        if all(done_by_wf_id[wfid].values()):
            pass