Example #1
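## GQ (global-queue) monitor fragment: assembles a per-site / per-campaign
## report of potential jobs, then raises critical alerts for ACDCs that cannot
## run and for workflows whose work-queue elements (WQE) are all done but
## which never toggled to completed.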
print "="*20
for site in sorted(jobs_for.keys()):
    report += '-'*10+site+'-'*10+'\n'
    for camp in sorted(jobs_for[site].keys()):
        report += "%s @ %s : %d potential jobs\n"%(camp,site,int(jobs_for[site][camp]))
        for wf in sorted(wf_for[site][camp]):
            report +="\t %s \n"%wf
    #print report

if not_runable_acdc:
    sendLog('GQ','These %d ACDCs cannot run \n%s'%( len(not_runable_acdc),
                                                   '\n'.join(sorted(not_runable_acdc))
                                                   ),level='critical')


old_stuck_all_done = set(json.loads(eosRead('%s/stuck_all_done.json'%base_eos_dir)))
really_stuck_all_done = old_stuck_all_done & stuck_all_done
if really_stuck_all_done:
    sendLog('GQ','These %d workflows have not toggled further to completed while all WQE are done\n%s'%( len(really_stuck_all_done),'\n'.join(sorted(really_stuck_all_done))),
            level='critical')
eosFile('%s/stuck_all_done.json'%base_eos_dir,'w').write( json.dumps( sorted( stuck_all_done), indent=2)).close()

if failed_workflow:
    sendLog('GQ','These workflows have failed wqe and will stay stuck:\n%s'%('\n'.join( failed_workflow)))
    pass

if agents_down:
    for agent,tasks in agents_down.iteritems():
        if not tasks: continue
        #sendLog('GQ','These tasks look stalled in agent %s \n%s'%( agent, '\n'.join(sorted(tasks))),level='critical')
        pass
Example #2
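## transferor: decides which assignment-approved workflows need their input
## staged, books the replica requests, and advances workflow status to
## staging/staged accordingly.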
def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    #NLI = newLockInfo()
    #if not NLI.free(): return
    LI = lockInfo()
    if not LI.free(): return

    mcm = McMClient(dev=False)
    dss = DSS()
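    ## dss serves input dataset sizes in GB (cf. the "[GB]" log messages below)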

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transfered = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).filter(
                ~Workflow.status.contains('custodial')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer"
    else:
        print being_transfered, "already being transferred", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    max_per_round = UC.get('max_per_round').get('transferor', None)
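    ## per-pass cap on the number of workflows to consider; applied below when
    ## the assignment-approved backlog is large, and again before the main loop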

    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    all_to_include = session.query(Workflow).filter(
        Workflow.status.startswith('considered')).all()
    if len(cache) > 2000:
        max_to_include = max_per_round
        random.shuffle(cache)  ## randomize first by wf name
        cache = sorted(cache, key=lambda r: r['RequestPriority'],
                       reverse=True)  ## order by prio
        highest = [r['RequestName'] for r in cache[:max_to_include]]
        all_to_include = [wfo for wfo in all_to_include if wfo.name in highest]
        print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len(
            all_to_include)

    for wfo in all_to_include:
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = defaultdict(float)
    ignored_input_sizes = defaultdict(float)
    input_cput = {}
    input_st = {}
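    ## note: input_cput and input_st are never filled in this version (the
    ## assignments below are commented out), so the CPU-hour and system-time
    ## totals printed later all come out as 0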
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    #stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read())
    stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print wfo.name, "staging"
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        blocks = wfh.getBlocks()
        for prim in primary:
            ds_s = dss.get(prim, blocks=blocks)
            if prim in stucks:
                wfh.sendLog(
                    'transferor',
                    "%s appears stuck, so not counting it %s [GB]" %
                    (prim, ds_s))
                ignored_input_sizes[prim] = max(ds_s,
                                                ignored_input_sizes[prim])
            else:
                input_sizes[prim] = max(ds_s, input_sizes[prim])
                wfh.sendLog('transferor',
                            "%s needs %s [GB]" % (wfo.name, ds_s))
        if in_transfer_priority == None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority == None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    input_blocks = {}
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        blocks = wfh.getBlocks()
        input_blocks[wfo.name] = blocks
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim, blocks=blocks)
            input_sizes[prim] = max(prim_size, input_sizes[prim])
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; allows us to transfer as many workflows as possible.
    def prio_and_size(i, j):
        if int(i[1].request['RequestPriority']) == int(
                j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)),
                       int(primary_input_per_workflow_gb.get(i[0].name, 0)))
        else:
            return cmp(int(i[1].request['RequestPriority']),
                       int(j[1].request['RequestPriority']))

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    if min_transfer_priority == None or in_transfer_priority == None:
        print "nothing is lining up for transfer"
        sendLog(
            "transferor",
            "No request in staging, using first request to set priority limit")
        if len(wfs_and_wfh):
            min_transfer_priority = wfs_and_wfh[0][1].request[
                'RequestPriority']
            in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority']
        else:
            return

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024 ## a quarter of the total disk space, TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = defaultdict(float)
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    if max_per_round and not specific:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                if wfh.isRelval():
                    wfo.status = 'forget'
                else:
                    wfo.status = 'trouble'  ## so that we look for a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        blocks = input_blocks.get(wfo.name, wfh.getBlocks())
        if blocks:
            print "Reading only", len(blocks), "blocks in input"
        this_load = sum([dss.get(prim, blocks=blocks) for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over budget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over budget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "%s minimum priority %s < %s : stop" %
                            (min_transfer_priority,
                             wfh.request['RequestPriority'],
                             in_transfer_priority))
                        no_budget = True

        ## throttle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = {}
        overide_parameters = {}
        check_secondary = (not wfh.isRelval())
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                overide_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'transferor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('transferor',
                            'These data tiers %s are not allowed in %s' %
                            (','.join(banned_tier), wfo.name),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('transferor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('transferor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            for sec in secondary:
                if sec in allowed_secondary:
                    overide_parameters.update(allowed_secondary[sec])

        if 'SiteWhitelist' in overide_parameters:
            sites_allowed = list(
                set(sites_allowed) & set(overide_parameters['SiteWhitelist']))
            wfh.sendLog(
                'transferor',
                'Intersecting with the overriding whitelist parameters, allowed sites become {}'
                .format(sites_allowed))

        if no_go:
            continue

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue
        #    break ## try this for a while to make things faster

        ## the site white list considers site, campaign, memory and core information
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

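        ## lock every input dataset so that it is not cleaned up while staging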
        for dataset in list(primary) + list(parent) + list(secondary):
            LI.lock(dataset, reason='staging')

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chop the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s  from cpu requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))

                if blocks:
                    print "limiting to blocks", "\n".join(sorted(blocks))
                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                #prim_destination = [site for site in destinations.keys() if not site in prim_location]
                prim_destination = [
                    site for (site, info) in destinations.items()
                    if info['data_fraction'] == 1 and info['completion'] != 100
                ]
                ## veto the site with no current disk space, for things that are not relval
                prim_destination = [
                    site for site in prim_destination
                    if (SI.disk[site] or wfh.isRelval())
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "Counting existing copies ; now need %s" % copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]

                latching_on_transfers = set()
                for site, info in destinations.items():
                    if site in prim_destination:
                        latching_on_transfers.update(info['blocks'].values())
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                potential_destinations = len(prim_to_distribute)
                #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]

                ## do we want to restrict transfers if the number of sites in veto is too large ?

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## means there are open transfer slots; proceed
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s go-on over transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:

                    existings = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(latching)).filter(
                            TransferImp.workflow_id == wfo.id).all()
                    if not existings:
                        tri = TransferImp(phedexid=int(latching), workflow=wfo)
                        print "adding", wfo.id, "with phedexid", latching
                        session.add(tri)
                    else:
                        for existing in existings:
                            existing.active = True

                    session.flush()

                    can_go = False
                    transfer_sizes[prim] = max(this_load, transfer_sizes[prim])
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - (min(copies_being_made) if copies_being_made else 0))
                wfh.sendLog(
                    'transferor',
                    "Counting the copies being made ; then need %s" %
                    copies_needed)
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The input is either fully in place or getting in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim))
                    sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim),
                        level='critical')

                    print json.dumps(prim_to_distribute, indent=2)
                    print json.dumps(prim_location, indent=2)
                    print json.dumps(prim_destination, indent=2)

                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute
                        if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                    ]

                    print "changed to"
                    print json.dumps(prim_to_distribute, indent=2)

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that's a parameter we can play with to limit the
                    if not options or options.chop:
                        ### hard-include the tape disk endpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        ## prune the blocks/destinations that are already in the making, so that subscriptions don't overlap
                        for site in spreading:
                            for block in list(spreading[site]):
                                if site in destinations and block in destinations[
                                        site]['blocks'].keys():
                                    ## prune it
                                    spreading[site].remove(block)

                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s cannot seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = max(this_load,
                                                   transfer_sizes[prim])
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
                else:
                    can_go = False
                    allowed = False

        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns.get(wfh.request['Campaign'], {}):
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']
            if 'SecondaryLocation' in overide_parameters:
                override_sec_destination = overide_parameters[
                    'SecondaryLocation']
            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric, and does not show the correct picture on workflow by workflow with different whitelists
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = set(
                        [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if k in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    print sec, json.dumps(destinations, indent=2)
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]  ## this is in SE
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                ## how to make unified understand that it has to wait for the secondary if the sec_destination and

                #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in sec_location
                ]
                #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if not SI.CE_to_SE(site) in sec_destination
                ]
                presitespace_sec_to_distribute = copy.deepcopy(
                    sec_to_distribute)
                #sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                #sec_to_distribute = [site for site in sec_to_distribute if not  SI.CE_to_SE(site) in SI.sites_veto_transfer]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]
                ## at this point you have a problem
                if len(sec_to_distribute) == 0 and len(
                        presitespace_sec_to_distribute):
                    sendLog(
                        'transferor',
                        '%s is getting no possible destinations because of lack of space. To be decided what to do in general'
                        % (sec),
                        level='critical')

                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] *
                                1024.) > sec_size or wfh.isRelval():
                            wfh.sendLog('transferor',
                                        'Sending %s to %s' % (sec, site))
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[
                                site_se] * 1024, "GB need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). %s will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024, wfo.name),
                                    level='critical')
                                wfh.sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024))
                else:
                    ## this is bad overall
                    print "the secondary input does not have to be sent to any site"

        ## is it possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else, settin staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
            #session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog(
                        'transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
                    #session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(sorted(no_goes)),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
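    ## fake_id is only referenced by the commented-out dry-run branch further down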
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## sites that do not want input datasets
        #if site in SI.sites_veto_transfer:
        #    print site,"does not want transfers"
        #    continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage the items a bit
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue
        transfered_items = defaultdict(set)
        if execute:
            priority = 'normal'
            cds = [
                ds for ds in set(datasets + block_datasets)
                if ds in max_priority
            ]
            ## bucketize the transfers by priority of workflows
            prioritized_items = defaultdict(set)
            for item in items_to_transfer:
                d = item.split('#')[0]
                p = max_priority.get(d, 80000)
                q = 'normal'
                if p > 100000:
                    q = 'reserved'
                elif p < 70000:
                    q = 'low'
                prioritized_items[q].add(item)

            for priority, items in prioritized_items.items():
                result = makeReplicaRequest(url,
                                            site_se,
                                            list(items),
                                            'prestaging',
                                            priority=priority,
                                            approve=True)
                if result:
                    these_transfers = [
                        o['id'] for o in result['phedex']['request_created']
                    ]
                    #phedexids.extend( these_transfers )
                    for ph in these_transfers:
                        transfered_items[ph].update(items)
                else:
                    sendLog(
                        'transferor',
                        'Could not make a replica request for items %s to site %s'
                        % (items, site_se),
                        level='critical')

            #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True)
            #phedexids = [o['id'] for o in result['phedex']['request_created']]:
        #else:
        #    #result= {'phedex':{'request_created' : []}}
        #    phedexids = []
        #    fake_id-=1

        if execute and not transfered_items:
            sendLog(
                'transferor',
                'Could not make a replica request for items %s to site %s' %
                (items_to_transfer, site),
                level='critical')
            continue
        for phedexid, items in transfered_items.items():
            print phedexid, "transfer created"
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items))):
                for wfid in workflow_dependencies[transfering]:
                    new_transfer = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(phedexid)).filter(
                            TransferImp.workflow_id == wfid).first()
                    if not new_transfer:
                        new_transfer = TransferImp(
                            phedexid=phedexid,
                            workflow=session.query(Workflow).get(wfid))
                        session.add(new_transfer)
                    else:
                        new_transfer.active = True

                    wf_id_in_prestaging.add(wfid)
            #session.commit()

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        #session.commit()

    ## one big session commit at the end once everything went fine
    session.commit()
Example #3
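## Reformatted variant of the GQ report fragment shown in Example #1.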
    report += '-' * 10 + site + '-' * 10 + '\n'
    for camp in sorted(jobs_for[site].keys()):
        report += "%s @ %s : %d potential jobs\n" % (camp, site,
                                                     int(jobs_for[site][camp]))
        for wf in sorted(wf_for[site][camp]):
            report += "\t %s \n" % wf
    #print report

if not_runable_acdc:
    sendLog('GQ',
            'These %d ACDCs cannot run \n%s' %
            (len(not_runable_acdc), '\n'.join(sorted(not_runable_acdc))),
            level='critical')

old_stuck_all_done = set(
    json.loads(eosRead('%s/stuck_all_done.json' % base_eos_dir)))
really_stuck_all_done = old_stuck_all_done & stuck_all_done
if really_stuck_all_done:
    sendLog(
        'GQ',
        'These %d workflows have not toggled further to completed while all WQE are done\n%s'
        %
        (len(really_stuck_all_done), '\n'.join(sorted(really_stuck_all_done))),
        level='critical')
eosFile('%s/stuck_all_done.json' % base_eos_dir,
        'w').write(json.dumps(sorted(stuck_all_done), indent=2)).close()

if failed_workflow:
    sendLog(
        'GQ', 'These workflows have failed wqe and will stay stuck:\n%s' %
        ('\n'.join(failed_workflow)))
Example #4
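## mappor builds the site "overflow" mapping (which site may read input from
## which fallback sites) plus its reverse, and stores both for the condor module.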
def mappor(url, options=None):

    up = componentInfo(soft=['mcm', 'wtc', 'jira'])

    ## define regionality: site => allowed fallbacks. feed on an SSB metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)

    UC = unifiedConfiguration()
    over_rides = []
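    ## opt-in overflow sources: each of T0, HLT and CSCS is enabled either via
    ## the unified configuration or via the corresponding command-line flag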
    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    if use_T0: over_rides.append('T0_CH_CERN')

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    if use_HLT: over_rides.append('T2_CH_CERN_HLT')

    use_CSCS = ('T0_CH_CSCS_HPC' in UC.get("site_for_overflow"))
    if options.cscs: use_CSCS = True
    if use_CSCS: over_rides.append('T0_CH_CSCS_HPC')

    SI = global_SI(over_rides)
    #print sorted(SI.all_sites)
    #print sorted(SI.sites_T0s)

    CI = campaignInfo()

    #sites_to_consider = SI.all_sites
    sites_to_consider = SI.sites_ready
    for site in sites_to_consider:
        region = site.split('_')[1]
        if not region in [
                'US',
                'DE',
                'IT',
                'FR',
                'CH',
                'ES',
                'UK',
                'RU'  ### latest addition
        ]:
            continue
        regions[region] = [region]

    def site_in_depletion(s):
        return True
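        ## short-circuited: every site currently counts as being in depletion,
        ## so the pressure check below is effectively dead code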
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
                pass

        return False

    for site in sites_to_consider:
        region = site.split('_')[1]
        ## fall back within the region, to sites with on-going low pressure
        within_region = [
            fb for fb in sites_to_consider
            if any([('_%s_' %
                     (reg) in fb and fb != site and site_in_depletion(fb))
                    for reg in regions[region]])
        ]
        #print site,region, within_region
        mapping[site] = within_region

    for site in sites_to_consider:
        if site.split('_')[1] == 'US':  ## for all sites in the US
            ## add NERSC
            mapping[site].append('T3_US_NERSC')
            mapping[site].append('T3_US_SDSC')
            mapping[site].append('T3_US_TACC')
            mapping[site].append('T3_US_PSC')
            ## add OSG
            mapping[site].append('T3_US_OSG')
            #mapping[site].append('T3_US_Colorado')
            pass

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        ## who can read from T0
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        mapping['T1_IT_CNAF'].append('T0_CH_CERN')
        mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')
        mapping['T1_DE_KIT'].append('T0_CH_CERN')

    if use_CSCS:
        ## analog config to T0:
        mapping['T2_CH_CERN'].append('T0_CH_CSCS_HPC')
        mapping['T1_IT_CNAF'].append('T0_CH_CSCS_HPC')
        mapping['T1_FR_CCIN2P3'].append('T0_CH_CSCS_HPC')
        mapping['T1_DE_KIT'].append('T0_CH_CSCS_HPC')

    ## tentatively
    mapping['T0_CH_CERN'].append('T2_CH_CERN')

    ## all European sites can read from CERN
    for reg in ['IT', 'DE', 'UK', 'FR', 'BE', 'ES']:
        mapping['T2_CH_CERN'].extend(
            [fb for fb in sites_to_consider if '_%s_' % reg in fb])
        pass

    ## all European T1s among each other
    europ_t1 = [
        site for site in sites_to_consider if site.startswith('T1')
        and any([reg in site for reg in ['IT', 'DE', 'UK', 'FR', 'ES', 'RU']])
    ]
    #print europ_t1
    for one in europ_t1:
        for two in europ_t1:
            if one == two: continue
            mapping[one].append(two)
            pass
        ## all EU T1 can read from T0
        mapping['T0_CH_CERN'].append(one)

    mapping['T0_CH_CERN'].append('T1_US_FNAL')
    #mapping['T1_IT_CNAF'].append( 'T1_US_FNAL' )
    #mapping['T1_IT_CNAF'].extend( [site for site in SI.sites_ready if '_US_' in site] ) ## all US can read from CNAF
    mapping['T1_IT_CNAF'].append('T2_CH_CERN')
    mapping['T1_DE_KIT'].append('T2_CH_CERN')
    mapping['T2_CH_CERN'].append('T1_IT_CNAF')
    mapping['T2_CH_CERN'].append('T1_US_FNAL')
    mapping['T2_CH_CERN'].append('T3_CH_CERN_HelixNebula')
    mapping['T2_CH_CERN'].append('T3_CH_CERN_HelixNebula_REHA')

    for site in sites_to_consider:
        if '_US_' in site:
            mapping[site].append('T2_CH_CERN')
    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads(eosRead('%s/GQ.json' % monitor_dir))
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)
            pass

    take_site_out = UC.get('site_out_of_overflow')

    for site, fallbacks in mapping.items():
        mapping[site] = list(set(fallbacks))

    ### mapping is a dictionary where
    # the key can read from the sites in its values.
    ### reversed mapping is a dictionary where
    # the key can be read by the sites in its values.
    ## create the reverse mapping for the condor module
    for site, fallbacks in mapping.items():
        if site in take_site_out:
            print "taking", site, "out of overflow source by unified configuration"
            mapping.pop(site)
            continue
        for fb in fallbacks:
            if fb == site:
                ## remove self
                mapping[site].remove(fb)
                continue
            if fb in take_site_out:
                ## remove those to be removed
                print "taking", fb, "out of overflow destination by unified configuration"
                mapping[site].remove(fb)
                continue
            if not site in reversed_mapping[fb]:
                reversed_mapping[fb].append(site)

    ## write it out and bail
    cache = cacheInfo()
    cache.store('overflow_mapping', mapping)
    cache.store('overflow_reverse_mapping', reversed_mapping)
    return
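Example #5

## Fragment of a disk-space "remaining" report: the first branch finishes the
## per-site HTML/JSON summaries on EOS; the else branch aggregates the
## per-site JSON files and prints why space is still held at full sites.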
        table += "</table>\n"
        table += "<table border=1></thead><tr><th>Dataset</th><th>Size [GB]</th><th>Label</th></tr></thead>\n"
        for item in ld:
            table+="<tr><td>%s</td><td>%d</td><td><ul>%s</ul></td></tr>\n"%( item[0], item[1]['size'], "<li>".join([""]+item[1]['reasons']))
        table+="</table></html>"
        #open('%s/remaining_%s.html'%(monitor_dir,site),'w').write( table )
        eosFile('%s/remaining_%s.html'%(monitor_dir,site),'w').write( table ).close()

    #open('%s/remaining.json'%monitor_dir,'w').write( json.dumps( remainings , indent=2))
    eosFile('%s/remaining.json'%monitor_dir,'w').write( json.dumps( remainings , indent=2)).close()

else:
    si = siteInfo()
    remainings={}
    for fn in filter(None,os.popen('ls -1 %s/remaining_*.json | sort '%monitor_dir).read().split('\n')):
        site = fn.split('_',1)[-1].split('.')[0]
        load = json.loads( eosRead(fn))
        remainings[site] = load
        if si.disk[site] : continue
        print site,si.disk[site],"[TB] free",si.quota[site],"[TB] quota"

        if not load: continue
        tags = ['pilup','input','output','lock','unlock','tape','stuck-tape','missing-tape']
        for tag in tags:
            v = sum([ info['size'] for ds,info in load.items() if tag in info['reasons']]) / 1024.
            print "\t %10f [TB] remaining because of %s"%(v,tag)
    #open('%s/remaining.json'%monitor_dir,'w').write( json.dumps( remainings , indent=2))
    eosFile('%s/remaining.json'%monitor_dir,'w').write( json.dumps( remainings , indent=2)).close()

Beispiel #6
0
now = time.mktime( time.gmtime())

## can we catch the datasets that actually should go to tape ?
custodial_override = {}
for c in CI.campaigns:
    if 'custodial_override' in CI.campaigns[c]:
        custodial_override[c] = CI.campaigns[c]['custodial_override']

newly_locking = set()
also_locking_from_reqmgr = set()

LI = lockInfo()

## add an ad-hoc list of things to lock. emptying this list would result in unlocking later
try:
    addHocLocks = json.loads( eosRead('%s/addhoc_lock.json'%base_eos_dir))
except:
    addHocLocks = []
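    ## bail out rather than carry on with an empty list: as noted above,
    ## an empty ad-hoc list would end up unlocking everything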
    sys.exit(0)

time_point("Starting addhoc")

for item in addHocLocks:
    ds = item.split('#')[0]
    LI.lock( ds , reason='addhoc lock')
    newly_locking.add( ds )


time_point("Starting reversed statuses check")

for status in statuses:
Beispiel #7
0
def batchor(url):
    UC = unifiedConfiguration()
    SI = global_SI()
    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        all_wfs.extend(
            getWorkflows(url,
                         'assignment-approved',
                         details=True,
                         user=user,
                         rtype='TaskChain'))

    wfs = filter(lambda r: r.get('SubRequestType') == 'RelVal', all_wfs)
    ## need a special treatment for those
    hi_wfs = filter(lambda r: r.get('SubRequestType') == 'HIRelVal', all_wfs)

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print "Relval:", wf['RequestName'], wf['Campaign']
        by_campaign[wf['Campaign']].add(wf['PrepID'])

    for wf in hi_wfs:
        print "HI Relval:", wf['RequestName'], wf['Campaign']
        by_hi_campaign[wf['Campaign']].add(wf['PrepID'])

    default_setup = {
        "go": True,
        "parameters": {
            "SiteWhitelist": ["T1_US_FNAL"],
            "MergedLFNBase": "/store/relval",
            "Team": "relval",
            "NonCustodialGroup": "RelVal"
        },
        "custodial_override": "notape",
        "phedex_group": "RelVal",
        "lumisize": -1,
        "fractionpass": 0.0,
        "maxcopies": 1
    }
    default_hi_setup = copy.deepcopy(default_setup)

    add_on = {}
    batches = json.loads(eosRead('%s/batches.json' % base_eos_dir))
    relval_routing = UC.get('relval_routing')

    def pick_one_site(p):
        ## modify the parameters on the spot to have only one site
        if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(
                p["parameters"]["SiteWhitelist"]) > 1:
            choose_from = list(
                set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready))
            picked = random.choice(choose_from)
            print "picked", picked, "from", choose_from
            p["parameters"]["SiteWhitelist"] = [picked]

    for campaign in by_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_setup)

        for key in relval_routing:
            if key in campaign:
                ## augment with the routing information
                augment_with = relval_routing[key]
                print "Modifying the batch configuration because of keyword", key
                print "with", augment_with
                setup = deep_update(setup, augment_with)

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the relval campaign %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
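        ## merge the PrepIDs of the newly seen workflows into any existing
        ## batch list for this campaign, keeping each PrepID only once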
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(
            set(
                list(copy.deepcopy(by_campaign[campaign])) +
                batches[campaign]))

    for campaign in by_hi_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_hi_setup)
        possible_sites = set(["T1_DE_KIT", "T1_FR_CCIN2P3"])
        hi_site = random.choice(list(possible_sites))
        setup["parameters"]["SiteWhitelist"] = [hi_site]

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the HI relval campaign %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(
            set(
                list(copy.deepcopy(by_hi_campaign[campaign])) +
                batches[campaign]))

    eosFile('%s/batches.json' % base_eos_dir,
            'w').write(json.dumps(batches, indent=2)).close()

    ## open the campaign configuration
    campaigns = json.loads(eosRead('%s/campaigns.relval.json' % base_eos_dir))

    ## protect for overwriting ??
    for new_campaign in list(set(add_on.keys()) - set(campaigns.keys())):
        ## this is new, and can be announced as such
        print new_campaign, "is new stuff"
        subject = "Request of RelVal samples batch %s" % new_campaign
        text = """Dear all, 
A new batch of relval workflows was requested.

Batch ID:

%s

Details of the workflows:

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message""" % (
            new_campaign,
            new_campaign,
        )

        print subject
        print text
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor', text, level='critical')

    ## go through all existing campaigns and remove the ones not in use anymore ?
    for old_campaign in campaigns.keys():
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        if not all_in_batch: continue
        active_statuses = [
            'completed', 'force-complete', 'running-open', 'running-closed',
            'acquired', 'assigned', 'assignment-approved'
        ]
        is_batch_done = all(wf['RequestStatus'] not in active_statuses
                            for wf in all_in_batch)
        ## check all statuses
        if is_batch_done:
            #print "batch",old_campaign,"can be closed or removed if necessary"
            #campaigns[old_campaign]['go'] = False ## disable
            campaigns.pop(old_campaign)  ## or just drop it all together ?
            print "batch", old_campaign, " configuration was removed"

    ## merge all anyways
    campaigns.update(add_on)

    ## write it out for posterity
    open('campaigns.json.updated', 'w').write(json.dumps(campaigns, indent=2))

    ## read back
    rread = json.loads(open('campaigns.json.updated').read())
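    ## read back (presumably as a cheap sanity check that the file just
    ## written is valid JSON); the parsed content is not used further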

    os.system('cp campaigns.json.updated %s/campaigns.relval.json' %
              monitor_dir)
    os.system('cp campaigns.json.updated %s/campaigns.relval.json' %
              base_eos_dir)
Beispiel #8
0
def stagor(url, specific=None, options=None):

    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    TS = transferStatuses()
    cached_transfer_statuses = TS.content()
    transfer_statuses = {}

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0

    lost_blocks = json.loads(
        eosRead('%s/lost_blocks_datasets.json' % monitor_dir))
    lost_files = json.loads(
        eosRead('%s/lost_files_datasets.json' % monitor_dir))
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys() + lost_files.keys()):
        b, f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset, "has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f:
            print dataset, "has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    def time_point(label="", sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s" % (label, nows)
        print "Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())
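    ## the lap timers live as attributes on the function object itself,
    ## serving as simple mutable state between successive calls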

    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    needs_by_priority = defaultdict(list)
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in [
                'running-open', 'running-closed', 'completed', 'assigned',
                'acquired'
        ]:
            wfi.sendLog('stagor',
                        "is in status %s" % wfi.request['RequestStatus'])
            wfo.status = 'away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            ## that usually happens for relvals
            if wfi.request['RequestStatus'] in [
                    'rejected', 'aborted', 'aborted-completed',
                    'aborted-archived', 'rejected-archived'
            ] and wfi.isRelval():
                wfo.status = 'forget'
                session.commit()
                continue
            else:
                print wfo.name, "is", wfi.request['RequestStatus']
                #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
                sendLog("stagor",
                        "%s is in %s, set away" %
                        (wfo.name, wfi.request['RequestStatus']),
                        level='critical')
                wfo.status = 'away'
                session.commit()
                continue

        wfois.append((wfo, wfi))
        _, primaries, _, secondaries = wfi.getIO()
        for dataset in list(primaries) + list(secondaries):
            needs[wfo.name].append(dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append(dataset)
            wfi.sendLog('stagor', '%s needs %s' % (wfo.name, dataset))

    time_point("Check staging workflows")

    open('%s/dataset_requirements.json' % monitor_dir,
         'w').write(json.dumps(needs, indent=2))
    for prio in needs_by_priority:
        needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    open('%s/dataset_priorities.json' % monitor_dir,
         'w').write(json.dumps(needs_by_priority, indent=2))

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()

    ## first check if anything is inactive
    all_actives = set([
        transfer.phedexid for transfer in session.query(TransferImp).filter(
            TransferImp.active).all()
    ])
    for active_phedexid in all_actives:
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip = False
                sendLog(
                    'stagor', "\t%s is staging for %s" %
                    (imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor', "setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(
        set([
            transfer.phedexid for transfer in session.query(
                TransferImp).filter(TransferImp.active).all()
        ]))
    for phedexid in all_actives:

        if specific: continue

        ## check on transfer completion
        not_cached = False
        if phedexid in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor', "read %s from cache" % phedexid)
            checks = cached_transfer_statuses[phedexid]
        else:
            ## I actually would like to avoid that all I can
            sendLog('stagor',
                    'Performing spurious transfer check on %s' % phedexid,
                    level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            try:
                print json.dumps(checks, indent=2)
            except:
                print checks

            if not checks:
                ## this is going to bias quite heavily the rest of the code. we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog(
                    'stagor',
                    'Stagor has got a skewed input from checkTransferStatus',
                    level='critical')
                checks = {}
                pass
            else:
                TS.add(phedexid, checks)

        time_point("Check transfer status %s" % phedexid, sub_lap=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname] = {}
                if not dsname in completion_by_input:
                    completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid] = all(
                    map(lambda i: i >= good_enough, checks[dsname].values()))
                completion_by_input[dsname][phedexid] = checks[dsname].values()
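                ## a dataset counts as done for this phedexid only when every
                ## destination node reports at least good_enough (100%) completion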
        if checks:
            sendLog(
                'stagor', "Checks for %s are %s" %
                (phedexid, [node.values() for node in checks.values()]))
            done = all(
                map(
                    lambda i: i >= good_enough,
                    list(
                        itertools.chain.from_iterable(
                            [node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ", phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:  # and tr_wf.status == 'staging':
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id] = {}
                done_by_wf_id[tr_wf.id][phedexid] = done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s, v in checks[ds].items():
                dataset_endpoints[ds].add(s)

        if done:
            sendLog('stagor', "%s is done" % phedexid)
            TS.add(phedexid, checks)
        else:
            sendLog(
                'stagor',
                "%s is not finished %s" % (phedexid, pprint.pformat(checks)))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [
                    SI.SE_to_CE(s) for s, v in checks[ds].items()
                    if v < good_enough
                ]
                sites_incomplete_down = [
                    s for s in sites_incomplete if not s in SI.sites_ready
                ]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]

                if sites_incomplete_down:
                    sendLog(
                        'stagor',
                        "%s are in downtime, while waiting for %s to get there"
                        % (",".join(sites_incomplete_down), ds))
                    endpoint_in_downtime[ds].update(sites_incomplete_down)
                if sites_incomplete:
                    endpoint_incompleted[ds].update(sites_incomplete)

    time_point("Check on-going transfers")

    print "End points"
    for k in dataset_endpoints:
        dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps(dataset_endpoints, indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime:
        endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps(endpoint_in_downtime, indent=2)

    print "End point incomplete in down time"
    for k in endpoint_incompleted:
        endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps(endpoint_incompleted, indent=2)

    #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    eosFile('%s/transfer_statuses.json' % monitor_dir,
            'w').write(json.dumps(TS.content(), indent=2)).close()
    eosFile('%s/dataset_endpoints.json' % monitor_dir,
            'w').write(json.dumps(dataset_endpoints, indent=2)).close()

    already_stuck = json.loads(
        eosRead('%s/stuck_transfers.json' % monitor_pub_dir)).keys()
    already_stuck.extend(getAllStuckDataset())

    missing_in_action = defaultdict(list)

    print "-" * 10, "Checking on workflows in staging", "-" * 10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda: defaultdict(float))
    presence_cache = defaultdict(dict)

    time_point("Preparing for more")
    for wfo, wfi in wfois:
        print "#" * 30
        time_point("Forward checking %s" % wfo.name, sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_, primaries, _, secondaries,
         sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys = {}
        for need in list(primaries) + list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor', "missing transfer report for %s" % need)
                readys[need] = False
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report',
                          '%s does not have a transfer report' % (need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog(
                    'stagor',
                    "assuming it is OK for secondary %s to have no attached transfers"
                    % need)
                readys[need] = True
                done_by_input[need] = {"fake": True}
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor', "%s is ready" % need)
                print json.dumps(done_by_input[need], indent=2)
                readys[need] = True
            else:
                wfi.sendLog(
                    'stagor', "%s is not ready \n%s" %
                    (need, json.dumps(done_by_input[need], indent=2)))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',
                            "all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor', "all needs are fullfilled, already")
                print json.dumps(readys, indent=2)
        else:
            wfi.sendLog('stagor', "missing requirements")
            copies_needed, _ = wfi.getNCopies()
            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):  #+list(secondaries):
                if endpoint_in_downtime[need] and endpoint_in_downtime[
                        need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog(
                        'stagor',
                        "%s has only incomplete endpoint in downtime\n%s" %
                        (need, endpoint_in_downtime[need]))
                    re_transfer = True

                if not se_allowed_key in available_cache[need]:
                    available_cache[need][
                        se_allowed_key] = getDatasetBlocksFraction(
                            url, need, sites=se_allowed)
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog(
                            'stagor',
                            "assuming it is OK to move on like this already for %s"
                            % need)
                        jump_ahead = True
                    else:
                        wfi.sendLog(
                            'stagor', "Available %s times" %
                            available_cache[need][se_allowed_key])
                        missing_and_downtime = list(
                            set(endpoint_in_downtime[need])
                            & set(endpoint_incompleted[need]))
                        if missing_and_downtime:
                            wfi.sendLog(
                                'stagor',
                                "%s is incomplete at %s which is in downtime, trying to move along"
                                % (need, ','.join(missing_and_downtime)))
                            jump_ahead = True
                        else:
                            wfi.sendLog(
                                'stagor',
                                "continue waiting for transfers for optimum production performance."
                            )

            ## compute a time since staging to filter jump starting ?
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries) + list(secondaries):
                if need in already_stuck:
                    wfi.sendLog('stagor',
                                "%s is stuck, so try to jump ahead" % need)
                    jump_ahead = True

            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead" % wfo.name
                details_text += '\n%s wants %s copies' % (wfo.name,
                                                          copies_needed)
                copies_needed = max(1, copies_needed - 1)
                details_text += '\nlowering by one unit to %s' % copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True

                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][
                            se_allowed_key] = getDatasetPresence(
                                url, need, within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update(presence.keys())
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog(
                        'stagor', "%s is available %s times (%s), at %s" %
                        (need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secondary
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog(
                        'stagor', "%s is this much transfered %s" %
                        (need, json.dumps(done_by_input[need], indent=2)))
                    all_check &= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:
                    wfi.sendLog(
                        'stagor',
                        "needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name, "has to wait a bit more"
                    wfi.sendLog('stagor', "needs to wait a bit more")
            else:
                wfi.sendLog('stagor', "not checking availability")

            if re_transfer:
                wfi.sendLog(
                    'stagor',
                    "Sending back to considered because of endpoint in downtime"
                )
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add(wfo.name)

    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor',
                "sending back to considered the following workflows \n%s" %
                ('\n'.join(send_back_to_considered)),
                level='critical')

    print "-" * 10, "Checking on non-available datasets", "-" * 10
    ## now check on those that are not fully available

    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min(available_cache[dsname].values())

    really_stuck_dataset = set()

    for dsname, available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(
                Workflow.name == using_it).first()
            if wf:
                using_wfos.append(wf)

        if not len(done_by_input[dsname]):
            print "For dataset", dsname, "there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending", wf.name, "back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor',
                                "%s was send back and might be trouble" %
                                wf.name,
                                level='critical')
                    else:
                        print "would send", wf.name, "back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog(
                            'stagor',
                            "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."
                            % wf.name,
                            level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete", dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks, lost_files = findLostBlocksFiles(
                url, dsname) if (not dsname.endswith('/RAW')) else ([], [])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss, _, n_missing = getDatasetBlockFraction(
                    dsname, lost_block_names)
                print "We have lost", len(
                    lost_block_names
                ), "blocks", lost_block_names, "for %f%%" % (100. *
                                                             fraction_loss)
                if fraction_loss > 0.05:  ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d blocks, for %d events, %3.2f %% loss'
                        % (dsname, len(lost_block_names), n_missing,
                           100 * fraction_loss),
                        level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog(
                                'stagor',
                                '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'
                                % (wf.name, dsname, len(lost_block_names),
                                   n_missing, 100 * fraction_loss),
                                level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog(
                            'stagor',
                            '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_block_names), n_missing,
                               fraction_loss, '\n'.join(lost_block_names)),
                            level='critical')
                        known_lost_blocks[dsname] = [
                            i['name'] for i in lost_blocks
                        ]
                really_stuck_dataset.add(dsname)

            if lost_files:
                fraction_loss, _, n_missing = getDatasetFileFraction(
                    dsname, lost_file_names)
                print "We have lost", len(
                    lost_file_names
                ), "files", lost_file_names, "for %f%%" % fraction_loss

                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d files, for %d events, %f %% loss' %
                        (dsname, len(lost_file_names), n_missing,
                         fraction_loss),
                        level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog(
                            'stagor',
                            '%s is missing %d files, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_file_names), n_missing,
                               fraction_loss, '\n'.join(lost_file_names)),
                            level='critical')
                        known_lost_files[dsname] = [
                            i['name'] for i in lost_files
                        ]

                ## should the status be change to held-staging and pending on a ticket

            missings = [
                pid for (pid, d) in done_by_input[dsname].items() if not d
            ]
            print "\t", done_by_input[dsname]
            print "\tneeds", len(done_by_input[dsname])
            print "\tgot", done_by_input[dsname].values().count(True)
            print "\tmissing", missings
            missing_in_action[dsname].extend(missings)

    rr = eosFile('%s/lost_blocks_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_blocks, indent=2))
    rr.close()

    rr = eosFile('%s/lost_files_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_files, indent=2))
    rr.close()

    eosFile('%s/incomplete_transfers.json' % monitor_dir,
            'w').write(json.dumps(missing_in_action, indent=2)).close()
    print "Stuck transfers and datasets"
    print json.dumps(missing_in_action, indent=2)

    TD = transferDataset()
    datasets_by_phid = defaultdict(set)
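    ## invert dataset -> phedexid into phedexid -> datasets, so each stuck
    ## transfer request lists the datasets it is holding up; the result is
    ## recorded through transferDataset (TD) below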
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add(dataset)

    for k in datasets_by_phid:
        #datasets_by_phid[k] = list(datasets_by_phid[k])
        TD.add(k, list(datasets_by_phid[k]))

    #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close()

    eosFile('%s/really_stuck_dataset.json' % base_eos_dir,
            'w').write(json.dumps(list(really_stuck_dataset),
                                  indent=2)).close()
    print '\n' * 2, "Datasets really stuck"
    print '\n'.join(really_stuck_dataset)

    #############
    ## not going further for what matters
    #############
    return
Beispiel #9
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(
        eosRead('%s/dataset_endpoints.json' % monitor_dir))
    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
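        ## cache is sorted by increasing RequestPriority, so the largest index
        ## means the highest priority; reverse=True puts those workflows first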
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlocks()

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]
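            ## a site counts as hosting the secondary only when more than 98%
            ## of the dataset is present there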

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations is None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor',
            "Intersecting with secondary requirement, now allowed %s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should all be closed already
            closeAllBlocks(url, prim, blocks)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = set()
                for (psite, (there, frac)) in presence.items():
                    if there:
                        sites_all_data.update(SI.SE_to_CEs(psite))
                sites_all_data = list(sites_all_data)
                #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if frac > 90.
                ]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = set()
                for psite in presence.keys():
                    sites_with_any_data.update(SI.SE_to_CEs(psite))
                sites_with_any_data = list(sites_with_any_data)
                #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            holding_but_not_allowed = set()
            for se_site in presence.keys():
                if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)):
                    holding_but_not_allowed.add(se_site)
            #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" %
                sorted(holding_but_not_allowed))
            if primary_locations is None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large number of CPUh %s, not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable locally if they are not holding the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFNBase is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFNBase is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhitelist set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the allowed sites is low pressure: reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assigned with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ## parse options entered on the command line, if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check in (None, False):
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    ## prefer the measured average events per job; fall back to the configured value
                    eventsPerJobEstimated = spl.get(
                        'avg_events_per_job', spl.get('events_per_job'))
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix():
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
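
The two helpers above make parameter precedence a pure matter of call order: dict.update overwrites keys in place, so whichever of pick_options and pick_campaign runs last wins. A minimal standalone sketch (names and values are illustrative, not Unified API):

def pick_options(options, parameters):
    ## apply only the command-line options that were actually set
    for key, v in options.items():
        if v is not None:
            parameters[key] = v

def pick_campaign(assign_parameters, parameters):
    ## apply campaign-specific assignment parameters
    parameters.update(assign_parameters.get('parameters', {}))

options = {'LumisPerJob': 10, 'Team': None}              # hypothetical CLI values
assign_parameters = {'parameters': {'LumisPerJob': 50}}  # hypothetical campaign config

params = {}
pick_options(options, params)
pick_campaign(assign_parameters, params)
print params['LumisPerJob']   # 50 : campaign wins (the default ordering)

params = {}
pick_campaign(assign_parameters, params)
pick_options(options, params)
print params['LumisPerJob']   # 10 : options win (the force_options ordering)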
Example #10
look_in_files = []
look_in_files.extend(
    filter(None,
           os.popen('find %s -name "*.log"' % condor_dir).read().split('\n')))
look_in_files.extend(
    filter(None,
           os.popen('find %s -name "*.out"' % condor_dir).read().split('\n')))
lfs = []
flfs = []
for out in look_in_files:
    print "Looking in file", out
    fhr = eosRead(out, trials=2)
    if not fhr: continue
    for line in fhr.split('\n'):
        if 'logArchive.tar.gz' in line:
            fullpath = filter(lambda w: 'logArchive.tar.gz' in w,
                              line.split())[0]
            alf = fullpath.split('/')[-1].strip()
            if '/' not in fullpath or alf == 'logArchive.tar.gz' or ':' in fullpath:
                print fullpath, "not satisfactory to find log file name"
                #sendLog('efficiencor','check on the logs of efficiencor, for %s'%(wf),level='critical')
                alf = None
                fullpath = None
                continue
            print "found log name", alf, fullpath, "in condor log", out.split('/')[-1]
            #print "full name",fullpath
Example #11
def stuckor(url=reqmgr_url):
    mlock = moduleLock()
    if mlock(): return
    TD = transferDataset()
    datasets_by_phid = TD.content()
    really_stuck_dataset = set(
        json.loads(eosRead('%s/really_stuck_dataset.json' % base_eos_dir)))

    UC = unifiedConfiguration()

    print "make a report of stuck transfers"
    bad_destinations = defaultdict(set)
    bad_sources = defaultdict(set)
    report = ""
    transfer_timeout = UC.get("transfer_timeout")
    transfer_lowrate = UC.get("transfer_lowrate")
    for phid, datasets in datasets_by_phid.items():
        issues = checkTransferLag(url, phid, datasets=list(datasets))
        for dataset in issues:
            for block in issues[dataset]:
                for destination in issues[dataset][block]:
                    (block_size, destination_size, delay, rate,
                     dones) = issues[dataset][block][destination]
                    ## count x_Buffer and x_MSS as one source
                    redones = []
                    for d in dones:
                        if d.endswith('Buffer') or d.endswith('Export'):
                            if d.replace('Buffer',
                                         'MSS').replace('Export',
                                                        'MSS') in dones:
                                continue
                            else:
                                redones.append(d)
                        else:
                            redones.append(d)
                    dones = list(set(redones))
                    #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones)
                    if delay > transfer_timeout and rate < transfer_lowrate:
                        if len(dones) > 1:
                            ## it's the destination that sucks
                            bad_destinations[destination].add(block)
                        else:
                            dum = [bad_sources[d].add(block) for d in dones]
                        really_stuck_dataset.add(dataset)
                        print "add", dataset, "to really stuck"
                        report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n" % (
                            block, destination, ", ".join(dones), rate, delay)
    print "\n" * 2

    ## create tickets right away ?
    report += "\nbad sources " + ",".join(bad_sources.keys()) + "\n"
    for site, blocks in bad_sources.items():
        report += "\n\n%s:" % site + "\n\t".join([''] + list(blocks))
    report += "\nbad destinations " + ",".join(bad_destinations.keys()) + "\n"
    for site, blocks in bad_destinations.items():
        report += "\n\n%s:" % site + "\n\t".join([''] + list(blocks))

    print '\n' * 2, "Datasets really stuck"
    print '\n'.join(really_stuck_dataset)

    print '\n' * 2, "report written at %s/logs/incomplete_transfers.log" % unified_url
    print report

    missing_in_action = json.loads(
        eosRead('%s/incomplete_transfers.json' % monitor_dir))
    stuck_transfers = dict([(k, v) for (k, v) in missing_in_action.items()
                            if k in really_stuck_dataset])
    print '\n' * 2, 'Stuck dataset transfers'
    print json.dumps(stuck_transfers, indent=2)
    eosFile('%s/stuck_transfers.json' % monitor_pub_dir,
            'w').write(json.dumps(stuck_transfers, indent=2)).close()
    eosFile('%s/logs/incomplete_transfers.log' % monitor_dir,
            'w').write(report).close()
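
The source-counting rule above, which folds a tape site's Buffer/Export endpoint into its MSS endpoint so one site is not counted as two independent sources, is easier to see in isolation; a sketch with made-up site names:

def dedup_sources(dones):
    ## drop X_Buffer / X_Export when the matching X_MSS also reports done
    redones = []
    for d in dones:
        if d.endswith('Buffer') or d.endswith('Export'):
            if d.replace('Buffer', 'MSS').replace('Export', 'MSS') in dones:
                continue
        redones.append(d)
    return sorted(set(redones))

print dedup_sources(['T1_IT_CNAF_Buffer', 'T1_IT_CNAF_MSS'])  # ['T1_IT_CNAF_MSS']
print dedup_sources(['T1_DE_KIT_Export'])                     # ['T1_DE_KIT_Export']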
Example #12
from utils import siteInfo, getWorkflowByInput, getWorkflowByOutput, getWorkflowByMCPileup, monitor_dir, monitor_pub_dir, eosRead, eosFile
import json
import sys
import time
import random
from collections import defaultdict

url = 'cmsweb.cern.ch'


print time.asctime(time.gmtime())

if sys.argv[1] == 'parse':
    force = False
    if len(sys.argv)>2:
        force = bool(sys.argv[2])
    locks = json.loads(eosRead('%s/globallocks.json'%monitor_pub_dir))
    #waiting = json.loads(open('%s/waiting_custodial.json'%monitor_dir).read())
    #stuck = json.loads(open('%s/stuck_custodial.json'%monitor_pub_dir).read())
    #missing = json.loads(open('%s/missing_approval_custodial.json'%monitor_dir).read())

    waiting = {}
    stuck = {}
    missing = {} 
    si = siteInfo()
    remainings={}
    sis = si.disk.keys()
    random.shuffle( sis )
    for site in sis:
        space = si.disk[site]
        if space: 
            continue
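
The loop above (truncated here) visits sites in random order and only acts on sites that report no free disk space; a minimal sketch of that selection with invented numbers:

import random

disk = {'T2_US_MIT': 120.5, 'T2_DE_DESY': 0, 'T1_IT_CNAF_Disk': 0}  # hypothetical [TB]
sis = list(disk.keys())
random.shuffle(sis)
for site in sis:
    if disk[site]:
        continue  # the site still has space: nothing to do
    print site, "has no disk space left"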
Example #14
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()  ## the global instance supersedes the plain siteInfo() above
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    # Temporarily switch off prioritization
    random.shuffle(wfos)
    ##order by priority instead of random
    """
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            return cache.index( wfn ) if wfn in cache else 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )
    """

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck inputs" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from 'away' will do whatever is necessary
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this after the secondary input location restriction: that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should all be closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2, but for the time being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large number of CPUh %s, not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check the number of sources; if large enough, we should be able to overflow most, efficiently

        ## default the white list back to the original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable locally if they do not host the data
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFNBase is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFNBase is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhitelist set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assigned with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1t2_only = [
            ce for ce in sites_allowed
            if ce.startswith('T1') or ce.startswith('T2')
        ]
        if t1t2_only:
            # try to pick from T1/T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
        else:
            # otherwise pick from any allowed site
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            # Do not set TrustPUSitelists to True if there is no secondary
            if secondary:
                parameters['TrustPUSitelists'] = True
                wfh.sendLog(
                    'assignor', "Reading secondary through xrootd at %s" %
                    sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ## parse options entered on the command line, if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check in (None, False):
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    ## prefer the measured average events per job; fall back to the configured value
                    eventsPerJobEstimated = spl.get(
                        'avg_events_per_job', spl.get('events_per_job'))
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
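
The PU_RD2 branch above sizes jobs by plain arithmetic: aim for roughly 2000 jobs with a 40% padding factor, then convert events per job into lumis per job using the dataset's average events per lumi, falling back to event splitting when a job would hold less than one lumi. A worked sketch with hypothetical numbers:

numEvents = 50000000    # hypothetical request size
eventsPerLumi = 300.    # hypothetical average from the input dataset
reqJobs = 2000
eventsPerJob = int(numEvents / (reqJobs * 1.4))  # 17857
lumisPerJob = int(eventsPerJob / eventsPerLumi)  # 59
if lumisPerJob == 0:
    ## fewer events per job than per lumi: fall back to event splitting
    print "EventsPerJob =", eventsPerJob
else:
    print "LumisPerJob =", lumisPerJob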
Example #15
def completor(url, specific):
    mlock = moduleLock(silent=True)
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    safe_mode = False

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira', False) else None

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all())
    wfs.extend(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())

    ## just take them in random order so that the same ones are not always seen first
    random.shuffle(wfs)

    max_per_round = UC.get('max_per_round').get('completor', None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    all_stuck = set()
    ## take into account what stagor was saying
    for itry in range(5):
        try:
            all_stuck.update(
                json.loads(eosRead('%s/stuck_transfers.json' %
                                   monitor_pub_dir)))
            break
        except:
            time.sleep(2)

    for itry in range(5):
        try:
            ## take into account the blocks that needed to be repositioned recently
            all_stuck.update([
                b.split('#')[0] for b in json.loads(
                    eosRead('%s/missing_blocks.json' % monitor_dir))
            ])
            break
        except:
            time.sleep(2)

    ## take into account all stuck blocks and datasets from the transfer team
    all_stuck.update(getAllStuckDataset())

    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {}
    timeout = {}
    campaign_injection_delay = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']
        if 'injection-delay' in CI.campaigns[c]:
            campaign_injection_delay[c] = CI.campaigns[c]['injection-delay']
        if 'overdoing-complete' in CI.campaigns[c]:
            overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:
        ## add all workflows that mcm wants to get force-completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    print "can truncate complete on"
    print json.dumps(truncate_fractions, indent=2)
    print "can overide on"
    print json.dumps(overrides, indent=2)
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time above which to fire a JIRA
    jira_priority_and_delays = {
        110000: 21,
        90000: 28,
        #     80000 : 60,
        #0 : 90
    }

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at", wfo.name

        ## gather the workflow information and its prepids
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip = False
        campaigns = wfi.getCampaigns()

        #if not any([c in good_fractions.keys() for c in campaigns]): skip=True
        #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True

        for user, spec in overrides.items():
            if not spec: continue
            spec = filter(None, spec)
            if not wfi.request['RequestStatus'] in [
                    'force-complete', 'completed'
            ]:
                if any(s in wfo.name
                       for s in spec) or (wfo.name in spec) or any(
                           pid in spec for pid in pids) or any(s in pids
                                                               for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url, wfi)
                    skip = True
                    wfi.notifyRequestor(
                        "The workflow %s was force completed by request of %s"
                        % (wfo.name, user),
                        do_batch=False)
                    wfi.sendLog(
                        'completor',
                        '%s is asking for %s to be force complete' %
                        (user, wfo.name))
                    break

        if wfo.status.startswith('assistance'): skip = True

        if skip:
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in [
                'acquired', 'running-open', 'running-closed'
        ]:
            continue

        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask(
        )  ## can use that one, and follow mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        #allowed_delay_per_out = {}
        for task, outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask(task)
            for out in outs:
                good_fraction_per_out[out] = good_fractions.get(
                    task_campaign, 1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(
                    task_campaign, default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(
                    task_campaign, 1000.)
                #allowed_delay_per_out[out] = timeout.get(task_campaign, 14)

        #print "force at", json.dumps( good_fraction_per_out, indent=2)
        #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2)

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.)

        priority_log = filter(lambda change: change['Priority'] == priority,
                              wfi.request.get('PriorityTransition', []))
        if not priority_log:
            print "\tHas no priority log"
            priority_delay = 0
        else:
            then = max([change['UpdateTime']
                        for change in priority_log]) / (60. * 60. * 24.)
            priority_delay = now - then  ## in days
            print "priority was set to", priority, priority_delay, "[days] ago"

        running_log = filter(
            lambda change: change["Status"] in ["running-open", "running-closed"],
            wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            delay = 0
        else:
            then = max([change['UpdateTime']
                        for change in running_log]) / (60. * 60. * 24.)
            delay = now - then  ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog(
            'completor',
            "Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"
            % (cpuh, delay, priority, priority_delay))
        if priority_delay != 0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url,
                                    original.request['OriginalRequestName'])
        injected_log = filter(
            lambda change: change["Status"] in ["assignment-approved"],
            original.request['RequestTransition'])
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / (60. * 60. * 24.)
            injection_delay = now - injected_on

        delay_for_priority_increase = injection_delay
        #delay_for_priority_increase = delay

        (w, d) = divmod(delay, 7)
        print "\t" * int(
            w) + "Running since", delay, "[days] priority=", priority

        pop_a_jira = False
        ping_on_jira = 7 * (24 * 60 * 60)  # 7 days
        for jp, jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd: pop_a_jira = True

        if pop_a_jira and JC:
            j, reopened, just_created = JC.create_or_last(
                prepid=wfi.request['PrepID'],
                priority=wfi.request['RequestPriority'],
                label='Late',
                reopen=True)
            last_time = JC.last_time(j)
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d" % (
                    delay, priority)
                JC.comment(j.key, j_comment)

        if delay_for_priority_increase != None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority:
            quantized = 5000  ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority'] + int(
                (delay_priority_increase *
                 (delay_for_priority_increase - injection_delay_threshold) / 7)
                / quantized) * quantized
            tail_cutting_priority += 101  ## to signal it is from this mechanism
            tail_cutting_priority = min(
                400000, tail_cutting_priority)  ## never go above 400k priority
            tail_cutting_priority = max(
                tail_cutting_priority,
                priority)  ## never go below the current value

            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, increasing to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'bumping priority to %d for being injected since %s' %
                        (tail_cutting_priority, delay_for_priority_increase))

                    reqMgrClient.changePriorityWorkflow(
                        url, wfo.name, tail_cutting_priority)
                    max_priority -= 1
                else:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, would like to increase to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'would like to bump priority to %d for being injected since %s'
                        % (tail_cutting_priority, delay_for_priority_increase))

                    print "Could be changing the priority to higher value, but too many already were done"

        _, prim, _, _ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck:
            wfi.sendLog('completor', '%s is stuck' % ','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = max([timeout.get(c, 14) for c in campaigns])

        monitor_delay = min(monitor_delay, allowed_delay)

        ### skip if too early, so we do not compute the completion fraction needlessly
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay:
            print "not enough time has passed yet"
            continue

        long_lasting[wfo.name] = {
            "delay": delay,
            "injection_delay": injection_delay
        }

        percent_completions = wfi.getCompletionFraction(caller='completor')

        if not percent_completions:
            sendLog('completor',
                    '%s has no output at all' % wfo.name,
                    level='critical')
            continue

        is_over_allowed_delay = (all([
            percent_completions[out] >= good_fraction_per_out.get(out, 1000.)
            for out in percent_completions
        ]) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and (all([
            percent_completions[out] >= truncate_fraction_per_out.get(
                out, 1000.) for out in percent_completions
        ])) and delay >= allowed_delay)
        is_over_completion = (all([
            percent_completions[out] >= good_fraction_nodelay_per_out.get(
                out, 1000.) for out in percent_completions
        ]))

        if is_over_completion:
            wfi.sendLog(
                'completor', "all is over completed %s\n %s" %
                (json.dumps(good_fraction_nodelay_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_allowed_delay:
            wfi.sendLog(
                'completor', "all is above %s \n%s" %
                (json.dumps(good_fraction_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_truncation_delay:
            wfi.sendLog(
                'completor',
                "all is above %s truncation level, and the input is stuck\n%s"
                % (json.dumps(truncate_fraction_per_out, indent=2),
                   json.dumps(percent_completions, indent=2)))

        else:
            long_lasting[wfo.name].update({
                'completion':
                sum(percent_completions.values()) / len(percent_completions),
                'completions':
                percent_completions
            })

            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog(
                'completor',
                "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"
                % (json.dumps(percent_completions, indent=2),
                   json.dumps(good_fraction_per_out, indent=2),
                   json.dumps(truncate_fraction_per_out, indent=2),
                   json.dumps(long_lasting[wfo.name]['agents'], indent=2)))
            continue

        #for output in  percent_completions:
        #    completions[output]['injected'] = then

        ran_at = wfi.request['SiteWhitelist']

        wfi.sendLog('completor', "Required %s, time spend %s" % (cpuh, delay))

        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        ## find ACDCs that might be running
        if max_force > 0:
            print "going for force-complete of", wfo.name
            if not safe_mode:
                forceComplete(url, wfi)
                set_force_complete.add(wfo.name)
                wfi.sendLog('completor', 'going for force completing')
                wfi.notifyRequestor(
                    "The workflow %s was force completed for running too long"
                    % wfo.name)
                max_force -= 1
            else:
                sendEmail(
                    'completor',
                    'The workflow %s is ready for force complete, but completor is in safe mode'
                    % wfo.name)
        else:
            wfi.sendLog(
                'completor',
                "too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog(
            'completor', 'The following were set force-complete \n%s' %
            ('\n'.join(set_force_complete)))

    #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text = "These have been running for long"

    #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))
    eosFile('%s/longlasting.json' % monitor_dir,
            'w').write(json.dumps(long_lasting, indent=2)).close()

    for wf, info in sorted(long_lasting.items(),
                           key=lambda tp: tp[1]['delay'],
                           reverse=True):
        delay = info['delay']
        text += "\n %s : %s days" % (wf, delay)
        if 'completion' in info:
            text += " %d%%" % (info['completion'] * 100)

    print text
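
The tail-priority bump in the completor loop above quantizes the new priority in 5000-unit steps, tags it with a +101 marker, and clamps it between the current priority and 400k. A minimal, self-contained sketch of that arithmetic (the function name and sample values here are illustrative, not from Unified's configuration):

def tail_cutting_priority(initial_priority, current_priority,
                          days_since_injection, threshold_days,
                          increase_per_week, quantum=5000, cap=400000):
    ## weeks spent past the injection-delay threshold, turned into a
    ## priority bump rounded down to whole 'quantum' steps
    overdue_weeks = (days_since_injection - threshold_days) / 7.
    bump = int((increase_per_week * overdue_weeks) / quantum) * quantum
    new_priority = initial_priority + bump
    new_priority += 101  ## signal that this mechanism set the value
    new_priority = min(cap, new_priority)  ## never go above 400k
    return max(new_priority, current_priority)  ## never go below the current value

## 35 days after injection, 21-day threshold, 10k/week increase:
print(tail_cutting_priority(85000, 85000, 35, 21, 10000))  ## 105101
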
Beispiel #16
0
now = time.mktime(time.gmtime())

## can we catch the datasets that actually should go to tape ?
custodial_override = {}
for c in CI.campaigns:
    if 'custodial_override' in CI.campaigns[c]:
        custodial_override[c] = CI.campaigns[c]['custodial_override']

newly_locking = set()
also_locking_from_reqmgr = set()

LI = lockInfo()

## add an ad-hoc list of things to lock; emptying this list would result in unlocking later
addHocLocks = json.loads(eosRead('%s/addhoc_lock.json' % base_eos_dir))

time_point("Starting addhoc")

for item in addHocLocks:
    ds = item.split('#')[0]
    LI.lock(ds, reason='addhoc lock')
    newly_locking.add(ds)

time_point("Starting reversed statuses check")

for status in statuses:
    print time.asctime(time.gmtime()), "GMT, fetching", status
    time_point("checking %s" % status, sub_lap=True)
    wfls = getWorkflows(url, status=status, details=True)
    print len(wfls), "in", status
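
The ad-hoc lock loop above reduces PhEDEx block names to their parent dataset by splitting on '#'. A small sketch of that normalization (the block and dataset names below are invented):

def dataset_of(item):
    ## a block name looks like '/Primary/Processed-vN/TIER#<uuid>';
    ## everything before the '#' is the dataset, and a plain dataset
    ## name, having no '#', passes through unchanged
    return item.split('#')[0]

print(dataset_of('/SingleMuon/Run2018A-v1/RAW#abc-123'))  ## '/SingleMuon/Run2018A-v1/RAW'
print(dataset_of('/SingleMuon/Run2018A-v1/RAW'))          ## unchanged
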
Beispiel #17
0
def completor(url, specific):
    mlock = moduleLock(silent=True)
    if mlock(): return 


    use_mcm = True
    up = componentInfo(soft=['mcm','wtc','jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    safe_mode = False

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira',False) else None

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## shuffle so that it is not always the same workflows being seen first
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]
        

    all_stuck = set()
    ## take into account what stagor was saying
    for itry in range(5):
        try:
            all_stuck.update( json.loads( eosRead('%s/stuck_transfers.json'%monitor_pub_dir)))
            break
        except:
            time.sleep(2)
        
    for itry in range(5):
        try:
            ## take into account the blocks that needed to be repositioned recently
            all_stuck.update( [b.split('#')[0] for b in json.loads( eosRead('%s/missing_blocks.json'%monitor_dir)) ] )
            break
        except:
            time.sleep(2)

    ## take into account all stuck block and dataset from transfer team
    all_stuck.update( getAllStuckDataset()) 


    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {} 
    timeout = {}
    campaign_injection_delay = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']
        if 'injection-delay' in CI.campaigns[c]:
            campaign_injection_delay[c] = CI.campaigns[c]['injection-delay']
        if 'overdoing-complete' in CI.campaigns[c]:
            overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:    
        ## add all workflows that mcm wants force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print "can truncate complete on"
    print json.dumps( truncate_fractions ,indent=2)
    print "can overide on"
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time above which to fire a JIRA
    jira_priority_and_delays = { 110000 : 21,
                                 90000 : 28,
                            #     80000 : 60,
                            #0 : 90
                             }
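    ## e.g. with the values above: a workflow at priority >= 110k running
    ## for >= 21 days, or at >= 90k running for >= 28 days, gets a JIRA
    ## opened or pinged further down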

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name

        ## gather the full workflow information
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        campaigns = wfi.getCampaigns()

        #if not any([c in good_fractions.keys() for c in campaigns]): skip=True
        #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True

        for user,spec in overrides.items():
            if not spec: continue
            spec = filter(None, spec)
            if not wfi.request['RequestStatus'] in ['force-complete', 'completed']:
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue



        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask() ## can use that one, and follow mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        #allowed_delay_per_out = {}
        for task,outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask( task )
            for out in outs:
                good_fraction_per_out[out] = good_fractions.get(task_campaign,1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(task_campaign,default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(task_campaign,1000.)
                #allowed_delay_per_out[out] = timeout.get(task_campaign, 14)

        #print "force at", json.dumps( good_fraction_per_out, indent=2)
        #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2)

        now = time.mktime(time.gmtime()) / (60*60*24.)

        priority_log = filter(lambda change: change['Priority'] == priority,wfi.request.get('PriorityTransition',[]))
        if not priority_log:
            print "\tHas no priority log"
            priority_delay = 0
        else:
            then = max([change['UpdateTime'] for change in priority_log]) / (60.*60.*24.)
            priority_delay = now - then ## in days
            print "priority was set to",priority,priority_delay,"[days] ago"

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            delay = 0
        else:
            then = max([change['UpdateTime'] for change in running_log]) / (60.*60.*24.)
            delay = now - then ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog('completor',"Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"%( cpuh, delay, priority, priority_delay))
        if priority_delay!=0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url, original.request['OriginalRequestName'])
        injected_log = filter(lambda change : change["Status"] in ["assignment-approved"],original.request['RequestTransition'])
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / (60.*60.*24.)
            injection_delay = now - injected_on
        

        delay_for_priority_increase = injection_delay
        #delay_for_priority_increase = delay

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority
        
        pop_a_jira = False
        ping_on_jira = 7 *(24*60*60) # 7 days
        for jp,jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd: pop_a_jira = True

        if pop_a_jira and JC:
            j,reopened,just_created = JC.create_or_last( prepid = wfi.request['PrepID'],
                                                    priority = wfi.request['RequestPriority'],
                                                    label = 'Late',
                                                    reopen = True)
            last_time = JC.last_time( j )
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d"%( delay, priority)
                JC.comment(j.key, j_comment)
            

        if delay_for_priority_increase!=None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority:
            quantized = 5000 ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority']+ int((delay_priority_increase * (delay_for_priority_increase - injection_delay_threshold) / 7) / quantized) * quantized
            tail_cutting_priority += 101 ## to signal it is from this mechanism
            tail_cutting_priority = min(400000, tail_cutting_priority) ## never go above 400k priority
            tail_cutting_priority = max(tail_cutting_priority, priority) ## never go below the current value
            
            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, increasing to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','bumping priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority)
                    max_priority-=1
                else:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, would like to increase to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','would like to bump priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    print "Could be changing the priority to higher value, but too many already were done"

        _,prim,_,_ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = max([timeout.get(c,14) for c in campaigns])
            
        monitor_delay = min(monitor_delay, allowed_delay)

        ### skip if too early, so we do not compute the completion fraction needlessly
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay: 
            print "not enough time has passed yet"
            continue

        long_lasting[wfo.name] = { "delay" : delay,
                                   "injection_delay" : injection_delay }

        percent_completions = wfi.getCompletionFraction(caller='completor')
        
        if not percent_completions:
            sendLog('completor','%s has no output at all'% wfo.name, level='critical')
            continue

        is_over_allowed_delay = (all([percent_completions[out] >= good_fraction_per_out.get(out,1000.) for out in percent_completions]) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and (all([percent_completions[out] >= truncate_fraction_per_out.get(out,1000.) for out in percent_completions])) and delay >= allowed_delay)
        is_over_completion = (all([percent_completions[out] >= good_fraction_nodelay_per_out.get(out,1000.) for out in percent_completions]))

        if is_over_completion:
            wfi.sendLog('completor', "all is over completed %s\n %s"%( json.dumps( good_fraction_nodelay_per_out, indent=2 ),
                                                                       json.dumps( percent_completions, indent=2 )
                                                                       ))
        elif is_over_allowed_delay:
            wfi.sendLog('completor', "all is above %s \n%s"%( json.dumps(good_fraction_per_out, indent=2 ), 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        elif is_over_truncation_delay:
            wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( json.dumps(truncate_fraction_per_out, indent=2 ),
                                                                                                      json.dumps( percent_completions, indent=2 ) ) )

        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"%(json.dumps( percent_completions, indent=2), 
                                                                                                 json.dumps(good_fraction_per_out, indent=2),
                                                                                                 json.dumps( truncate_fraction_per_out, indent=2),
                                                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        #for output in  percent_completions:
        #    completions[output]['injected'] = then
            

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        ## find ACDCs that might be running
        if max_force>0:
            print "going for force-complete of",wfo.name
            if not safe_mode:
                forceComplete(url, wfi )
                set_force_complete.add( wfo.name )
                wfi.sendLog('completor','going for force completing')
                wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
                max_force -=1
            else:
                sendEmail('completor', 'The workflow %s is ready for force complete, but completor is in safe mode'%wfo.name)
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog('completor','The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))
    eosFile('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )).close()

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )


    print text
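
Both completor listings convert the epoch timestamps in RequestTransition to days and measure how long the workflow has been running. A standalone sketch of that conversion (the transition log here is fabricated):

import time

DAY = 60 * 60 * 24.

def running_delay_days(transitions, now=None):
    ## keep only transitions into a running state and report, in days,
    ## how long ago the most recent one happened
    if now is None:
        now = time.mktime(time.gmtime()) / DAY
    running = [t for t in transitions
               if t['Status'] in ('running-open', 'running-closed')]
    if not running:
        return 0  ## same convention as above: no running log, no delay
    then = max(t['UpdateTime'] for t in running) / DAY
    return now - then

fake_log = [{'Status': 'assignment-approved', 'UpdateTime': 0},
            {'Status': 'running-open', 'UpdateTime': int(10 * DAY)}]
print(running_delay_days(fake_log, now=17.0))  ## 7.0
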
Beispiel #18
0
 if not os.path.isdir( condor_dir ):
     print "Ex: ",com
     os.system(com)
     
 #outs = os.popen('find %s -name "*.out"'% ( condor_dir )).read()
 #print outs
 look_in_files = []
 look_in_files.extend( filter(None,os.popen('find %s -name "*.log"'% ( condor_dir )).read().split('\n')) )
 look_in_files.extend( filter(None,os.popen('find %s -name "*.out"'% ( condor_dir )).read().split('\n')) )
 lfs = []
 flfs = []
 for out in look_in_files:
     print "Looking in file",out
     fhr = eosRead( out , trials=2 )
     if not fhr: continue
     for line in fhr.split('\n'):
         if 'logArchive.tar.gz' in line:
             fullpath = filter(lambda w : 'logArchive.tar.gz' in w, line.split())[0]
             alf = fullpath.split('/')[-1].strip()
             if not '/' in fullpath or alf == 'logArchive.tar.gz' or ':' in fullpath:
                 print fullpath,"not satisfactory to find log file name"
                 #sendLog('efficiencor','check on the logs of efficiencor, for %s'%(wf),level='critical')
                 alf = None
                 fullpath = None
                 continue
             print "found log name", alf,fullpath,"in condor log",out.split('/')[-1]
             #print "full name",fullpath
             lfs.append( alf )
             flfs.append( fullpath )
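
The fragment above scans condor .log/.out files for 'logArchive.tar.gz' mentions and keeps only tokens that look like a real path. A self-contained sketch of that filter (the sample lines are invented):

def extract_log_archives(text):
    ## take the first whitespace-separated token containing the archive
    ## name, and keep it only if it carries a usable path: it must have
    ## directory separators, more than the bare archive name, and no ':'
    found = []
    for line in text.split('\n'):
        if 'logArchive.tar.gz' not in line:
            continue
        words = [w for w in line.split() if 'logArchive.tar.gz' in w]
        fullpath = words[0]
        name = fullpath.split('/')[-1].strip()
        if '/' not in fullpath or name == 'logArchive.tar.gz' or ':' in fullpath:
            continue  ## bare name or URL-like token: skip it
        found.append((name, fullpath))
    return found

sample = ("copied /store/logs/123/0-logArchive.tar.gz to local disk\n"
          "status: logArchive.tar.gz pending")
print(extract_log_archives(sample))
## [('0-logArchive.tar.gz', '/store/logs/123/0-logArchive.tar.gz')]
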
Beispiel #19
0
def stagor(url, specific=None, options=None):
    
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    TS = transferStatuses()
    cached_transfer_statuses = TS.content()
    transfer_statuses = {}


    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost_blocks = json.loads(eosRead('%s/lost_blocks_datasets.json'%monitor_dir))
    lost_files = json.loads(eosRead('%s/lost_files_datasets.json'%monitor_dir))
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys()+lost_files.keys()):
        b,f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset,"has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f: 
            print dataset,"has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]


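    ## lap timer: time_point stores its start/lap/sub-lap timestamps as
    ## attributes on the function object itself, so repeated calls can
    ## print both the total elapsed time and per-lap deltas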
    def time_point(label="",sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s"%(label, nows)
        print "Since start: %s [s]"% ( now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) 
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]"% ( now - time_point.lap ) 
            time_point.lap = now            
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())


    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    needs_by_priority = defaultdict(list)
    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']:
            wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus'])
            wfo.status='away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            ## that usually happens for relvals
            if wfi.request['RequestStatus'] in ['rejected','aborted','aborted-completed','aborted-archived','rejected-archived'] and wfi.isRelval():
                wfo.status='forget'
                session.commit()
                continue
            else:
                print wfo.name,"is",wfi.request['RequestStatus']
                #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
                sendLog("stagor","%s is in %s, set away"%(wfo.name,wfi.request['RequestStatus']), level='critical')
                wfo.status = 'away'
                session.commit()
                continue

        wfois.append( (wfo,wfi) )            
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            needs[wfo.name].append( dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append( dataset )
            wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset))

    time_point("Check staging workflows")            

    #open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2))
    eosFile('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2)).close()
    for prio in needs_by_priority: needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    #open('%s/dataset_priorities.json'%monitor_dir,'w').write( json.dumps( needs_by_priority , indent=2))
    eosFile('%s/dataset_priorities.json'%monitor_dir,'w').write( json.dumps( needs_by_priority , indent=2)).close()
        

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()


    ## first check if anything is inactive
    all_actives = set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()])
    for active_phedexid in all_actives:
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip =False
                sendLog('stagor',"\t%s is staging for %s"%(imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor',"setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(set([transfer.phedexid for transfer in session.query(TransferImp).filter(TransferImp.active).all()]))
    for phedexid in all_actives:

        if specific: continue

        ## check on transfer completion
        not_cached = False
        if phedexid in cached_transfer_statuses:
            ### use a cache for transfers that already looked done
            sendLog('stagor',"read %s from cache"%phedexid)
            checks = cached_transfer_statuses[phedexid]
        else:
            not_cached = True  ## remember we had to do a live check
            ## I would like to avoid these live checks as much as possible
            sendLog('stagor','Performing spurious transfer check on %s'% phedexid, level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            try:
                print json.dumps(checks, indent=2)
            except:
                print checks

            if not checks:
                ## this would bias the rest of the code quite heavily; we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog('stagor','Stagor has got a skewed input from checkTransferStatus', level='critical')
                checks = {}
                pass
            else:
                TS.add( phedexid, checks)                

        time_point("Check transfer status %s"% phedexid, sub_lap=True)            

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][phedexid]=checks[dsname].values()
        if checks:
            sendLog('stagor',"Checks for %s are %s"%( phedexid, [node.values() for node in checks.values()]))
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ",phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        transfers_phedexid = session.query(TransferImp).filter(TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][phedexid]=done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s,v in checks[ds].items():
                dataset_endpoints[ds].add( s )

        if done:
            sendLog('stagor',"%s is done"%phedexid)
            TS.add( phedexid, checks)
        else:
            sendLog('stagor',"%s is not finished %s"%(phedexid, pprint.pformat( checks )))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough]
                sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]



                if sites_incomplete_down:
                    sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds))
                    endpoint_in_downtime[ds].update( sites_incomplete_down )                    
                if sites_incomplete:
                    endpoint_incompleted[ds].update( sites_incomplete )

            

    time_point("Check on-going transfers")            


    print "End points"
    for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps( dataset_endpoints , indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps( endpoint_in_downtime , indent=2)    

    print "End point incomplete in down time"
    for k in endpoint_incompleted: endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps( endpoint_incompleted , indent=2)        


    #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    eosFile('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( TS.content(), indent=2)).close()
    eosFile('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2)).close()

    already_stuck = json.loads( eosRead('%s/stuck_transfers.json'%monitor_pub_dir) ).keys()
    already_stuck.extend( getAllStuckDataset() )
 
    missing_in_action = defaultdict(list)


    print "-"*10,"Checking on workflows in staging","-"*10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda : defaultdict(float))
    presence_cache = defaultdict(dict)
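    ## both caches are keyed by dataset and then by the comma-joined list
    ## of allowed storage elements, so workflows sharing a site whitelist
    ## reuse a single availability/presence lookup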

    time_point("Preparing for more")
    for wfo,wfi in wfois:
        print "#"*30
        time_point("Forward checking %s"% wfo.name,sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys={}
        for need in list(primaries)+list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor',"missing transfer report for %s"%need)
                readys[need] = False      
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report','%s does not have a transfer report'%(need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need)
                readys[need] = True
                done_by_input[need] = { "fake" : True }
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor',"%s is ready"%need)
                print json.dumps( done_by_input[need] , indent=2)
                readys[need] = True
            else:
                wfi.sendLog('stagor',"%s is not ready \n%s.\nWaiting for the following:"%(need,json.dumps( done_by_input[need] , indent=2)))
                for request_id in done_by_input[need]:
                    if not done_by_input[need][request_id]:
                        wfi.sendLog('stagor',"https://cmsweb.cern.ch/phedex/prod/Request::View?request={}".format(request_id))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',"all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor',"all needs are fullfilled, already")
                print json.dumps( readys, indent=2 )
        else:
            wfi.sendLog('stagor',"missing requirements")
            copies_needed_from_CPUh, _ = wfi.getNCopies()
            
            copies_needed = copies_needed_from_CPUh
            if 'Campaign' in wfi.request and wfi.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfi.request['Campaign']]:
                copies_needed_from_campaign = CI.campaigns[wfi.request['Campaign']]['maxcopies']
                copies_needed = min(copies_needed_from_campaign, copies_needed_from_CPUh)

            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):#+list(secondaries):
                if endpoint_in_downtime[need] and endpoint_in_downtime[need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime\n%s"%(need, endpoint_in_downtime[need] ))
                    re_transfer=True
                
                if not se_allowed_key in available_cache[need]:
                    available_cache[need][se_allowed_key]  = getDatasetBlocksFraction( url , need, sites=se_allowed )
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need)
                        jump_ahead = True
                    else:
                        wfi.sendLog('stagor',"Available {} times. {} needed".format(available_cache[need][se_allowed_key], copies_needed))
                        missing_and_downtime = list(set(endpoint_in_downtime[need]) & set(endpoint_incompleted[need]))
                        if missing_and_downtime:
                            wfi.sendLog('stagor',"%s is incomplete at %s which is in downtime, trying to move along"%(need, ','.join(missing_and_downtime)))
                            jump_ahead = True
                        else:
                            wfi.sendLog('stagor',"continue waiting for transfers for optimum production performance.")



            ## compute a time since staging to filter jump starting ?                    
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries)+list(secondaries):
                if need in already_stuck: 
                    wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need)
                    jump_ahead = True
                    
            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead"%wfo.name
                details_text += '\n%s wants %s copies'%(wfo.name,copies_needed)
                copies_needed = max(1,copies_needed-1)
                details_text += '\nlowering by one unit to %s'%copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True
                
                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update( presence.keys() )
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog('stagor', "%s is available %s times (%s), at %s"%( need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secondary
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog('stagor',"%s is this much transfered %s"%(need, json.dumps(done_by_input[need], indent=2)))
                    all_check&= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:    
                    wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name,"has to wait a bit more"
                    wfi.sendLog('stagor',"needs to wait a bit more")
            else:
                wfi.sendLog('stagor',"not checking availability")

            if re_transfer:
                wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime")
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add( wfo.name )


    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical')

    print "-"*10,"Checking on non-available datasets","-"*10    
    ## now check on those that are not fully available
    
    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min( available_cache[dsname].values() )

    really_stuck_dataset = set()

    for dsname,available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )

        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical')
                    else:
                        print "would send",wf.name,"back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete",dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks,lost_files = findLostBlocksFiles( url, dsname ) if (not dsname.endswith('/RAW')) else ([],[])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names)
                print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss)
                if fraction_loss > 0.05: ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos: 
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog('stagor', '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss, '\n'.join( lost_block_names ) ), level='critical')
                        known_lost_blocks[dsname] = [i['name'] for i in lost_blocks]
                really_stuck_dataset.add( dsname )
                  
            if lost_files:
                fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names)
                print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss
                
                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d files, for %d events, %3.2f %% loss'%(dsname, len(lost_file_names),n_missing, 100*fraction_loss), level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove    
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog('stagor', '%s is missing %d files, for %d events, %3.2f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, 100*fraction_loss, '\n'.join(lost_file_names)), level='critical')
                        known_lost_files[dsname] = [i['name'] for i in lost_files]

                ## should the status be changed to held-staging, pending on a ticket?



            missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] 
            print "\t",done_by_input[dsname]
            print "\tneeds",len(done_by_input[dsname])
            print "\tgot",done_by_input[dsname].values().count(True)
            print "\tmissing",missings
            missing_in_action[dsname].extend( missings )
        


    eosFile('%s/lost_blocks_datasets.json'%monitor_dir,'w').write( json.dumps( known_lost_blocks, indent=2)).close()
    eosFile('%s/lost_files_datasets.json'%monitor_dir,'w').write( json.dumps( known_lost_files, indent=2)).close()
    eosFile('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) ).close()

    print "Stuck transfers and datasets"
    print json.dumps( missing_in_action, indent=2 )


    TD = transferDataset()
    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add( dataset )

    for k in datasets_by_phid:
        #datasets_by_phid[k] = list(datasets_by_phid[k])
        TD.add( k , list(datasets_by_phid[k]))

    #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close()

    eosFile('%s/really_stuck_dataset.json'%base_eos_dir,'w').write( json.dumps(list(really_stuck_dataset), indent=2 )).close()
    print '\n'*2,"Datasets really stuck"
    print '\n'.join( really_stuck_dataset )

    #############
    ## not going further for what matters
    #############
    return
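
For reference, the "done" decision in stagor flattens every per-node completion percentage of a PhEDEx request and requires all of them to reach good_enough. A compact sketch of that reduction over a made-up checks structure:

import itertools

GOOD_ENOUGH = 100.0

def transfer_done(checks, threshold=GOOD_ENOUGH):
    ## checks maps dataset -> {node: percent}; flatten all percentages and
    ## require every one to reach the threshold. Note that the code above
    ## treats an empty checks dict as *not* done, so callers should guard
    ## for that case before relying on this reduction
    all_percents = itertools.chain.from_iterable(
        node_map.values() for node_map in checks.values())
    return all(p >= threshold for p in all_percents)

fake_checks = {'/A/B/RAW': {'T1_X_Disk': 100.0, 'T2_Y': 100.0},
               '/C/D/AOD': {'T1_X_Disk': 97.3}}
print(transfer_done(fake_checks))  ## False: /C/D/AOD is only at 97.3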