Example #1
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    # Temporarily switch off prioritization
    random.shuffle(wfos)
    ##order by priority instead of random
    """
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            return cache.index( wfn ) if wfn in cache else 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )
    """

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should bbe all closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1t2_only = [
            ce for ce in sites_allowed
            if [ce.startswith('T1') or ce.startswith('T2')]
        ]
        if t1t2_only:
            # try to pick from T1T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
            # then pick any otherwise
        else:
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            # Do not set TrustPUSitelist to True if there is no secondary
            if secondary:
                parameters['TrustPUSitelists'] = True
                wfh.sendLog(
                    'assignor', "Reading secondary through xrootd at %s" %
                    sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    # FIXME: decide which of the lines below needs to remain...
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Example #2
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock('assignor'): return

    CI = campaignInfo()
    SI = siteInfo()

    wfos = []
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name == specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(
                Workflow.status == 'considered').all()
            wfos.extend(
                session.query(Workflow).filter(
                    Workflow.status == 'staging').all())
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == 'staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print wfo.name, "to be assigned"
        wfh = workflowInfo(url, wfo.name)

        ## check if by configuration we gave it a GO
        if not CI.go(wfh.request['Campaign']) and not options.go:
            print "No go for", wfh.request['Campaign']
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            print wfo.name, wfh.request['RequestStatus'], "skipping"
            if not options.test:
                continue

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                continue

        (lheinput, primary, parent, secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList(
            (lheinput, primary, parent, secondary))
        print "Allowed", sites_allowed
        sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        sites_custodial = []
        if len(sites_custodial) == 0:
            print "No custodial, it's fine, it's covered in close-out"

        if len(sites_custodial) > 1:
            print "more than one custodial for", wfo.name
            sys.exit(36)

        secondary_locations = None
        for sec in list(secondary):
            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items() if there
            ]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            sites_allowed = [
                site for site in sites_allowed if any([
                    osite.startswith(site) for osite in one_secondary_locations
                ])
            ]

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence(url, prim)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed])
            sites_all_data = [
                site for site in sites_with_data if any([
                    osite.startswith(site) for osite in [
                        psite for (psite, (there, frac)) in presence.items()
                        if there
                    ]
                ])
            ]
            sites_with_data = [
                site for site in sites_with_data if any([
                    osite.startswith(site) for osite in [
                        psite for (psite, frac) in presence.items()
                        if frac[1] > 90.
                    ]
                ])
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if any([osite.startswith(site) for osite in presence.keys()])
            ]
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        ## opportunistic running where any piece of data is available
        if secondary_locations and primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            opportunistic_sites = [
                SI.SE_to_CE(site) for site in list((set(secondary_locations)
                                                    & set(primary_locations)) -
                                                   set(sites_allowed))
            ]
            print "We could be running at", opportunistic_sites, "in addition"

        if available_fractions and not all(
            [available >= 1. for available in available_fractions.values()]):
            print "The input dataset is not located in full at any site"
            print json.dumps(available_fractions)
            if not options.test and not options.go: continue  ## skip skip skip
        copies_wanted = 2.
        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            print "The input dataset is not available", copies_wanted, "times, only", available_fractions.values(
            )
            if not options.go:
                continue

        ## default back to white list to original white list with any data
        print "Allowed", sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data", sites_allowed

        if options.restrict:
            print "Allowed", sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected", sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for", list(
                    set(sites_allowed) - set(sites_with_data)), "?"
                print "Whitelist site with any data", list(
                    set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name, "cannot be assign with no matched sites"
            continue

        parameters = {
            'SiteWhitelist': sites_allowed,
            'CustodialSites': sites_custodial,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': '/store/mc',  ## to be figured out ! from Hi shit
            'ProcessingVersion': version,
        }

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if ',' in v: parameters[key] = filter(None, v.split(','))
                    else: parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            ## needs to go to event based ? fail for now
            print "Falling back to event splitting ?"
            #parameters['SplittingAlgorithm'] = 'EventBased'
            continue

        ## plain assignment here
        team = 'production'
        if options and options.team:
            team = options.team
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
Example #3
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    #NLI = newLockInfo()
    #if not NLI.free(): return
    LI = lockInfo()
    if not LI.free(): return

    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transfered = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).filter(
                ~Workflow.status.contains('custodial')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer"
    else:
        print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    max_per_round = UC.get('max_per_round').get('transferor', None)

    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    all_to_include = session.query(Workflow).filter(
        Workflow.status.startswith('considered')).all()
    if len(cache) > 2000:
        max_to_include = max_per_round
        random.shuffle(cache)  ## randomize first by wf name
        cache = sorted(cache, key=lambda r: r['RequestPriority'],
                       reverse=True)  ## order by prio
        highest = [r['RequestName'] for r in cache[:max_to_include]]
        all_to_include = [wfo for wfo in all_to_include if wfo.name in highest]
        print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len(
            all_to_include)

    for wfo in all_to_include:
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = defaultdict(float)
    ignored_input_sizes = defaultdict(float)
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    #stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read())
    stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print wfo.name, "staging"
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        blocks = wfh.getBlocks()
        for prim in primary:
            ds_s = dss.get(prim, blocks=blocks)
            if prim in stucks:
                wfh.sendLog(
                    'transferor',
                    "%s appears stuck, so not counting it %s [GB]" %
                    (prim, ds_s))
                ignored_input_sizes[prim] = max(ds_s,
                                                ignored_input_sizes[prim])
            else:
                input_sizes[prim] = max(ds_s, input_sizes[prim])
                wfh.sendLog('transferor',
                            "%s needs %s [GB]" % (wfo.name, ds_s))
        if in_transfer_priority == None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority == None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    input_blocks = {}
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        blocks = wfh.getBlocks()
        input_blocks[wfo.name] = blocks
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim, blocks=blocks)
            input_sizes[prim] = max(prim_size, input_sizes[prim])
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size(i, j):
        if int(i[1].request['RequestPriority']) == int(
                j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)),
                       int(primary_input_per_workflow_gb.get(i[0].name, 0)))
        else:
            return cmp(int(i[1].request['RequestPriority']),
                       int(j[1].request['RequestPriority']))

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    if min_transfer_priority == None or in_transfer_priority == None:
        print "nothing is lining up for transfer"
        sendLog(
            "transferor",
            "No request in staging, using first request to set priority limit")
        if len(wfs_and_wfh):
            min_transfer_priority = wfs_and_wfh[0][1].request[
                'RequestPriority']
            in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority']
        else:
            return

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = defaultdict(float)
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                if wfh.isRelval():
                    wfo.status = 'forget'
                else:
                    wfo.status = 'trouble'  ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        blocks = input_blocks.get(wfo.name, wfh.getBlocks())
        if blocks:
            print "Reading only", len(blocks), "blocks in input"
        this_load = sum([dss.get(prim, blocks=blocks) for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "%s minimum priority %s < %s : stop" %
                            (min_transfer_priority,
                             wfh.request['RequestPriority'],
                             in_transfer_priority))
                        no_budget = True

        ## throtlle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = {}
        overide_parameters = {}
        check_secondary = (not wfh.isRelval())
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                overide_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'transferor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('transferor',
                            'These data tiers %s are not allowed in %s' %
                            (','.join(banned_tier), wfo.name),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('transferor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('transferor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            for sec in secondary:
                if sec in allowed_secondary:
                    overide_parameters.update(allowed_secondary[sec])

        if 'SiteWhitelist' in overide_parameters:
            sites_allowed = list(
                set(sites_allowed) & set(overide_parameters['SiteWhitelist']))
            wfh.sendLog(
                'transferor',
                'Intersecting with the overriding whitelist parameters, allowed sites become {}'
                .format(sites_allowed))

        if no_go:
            continue

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue
        #    break ## try this for a while to make things faster

        ## the site white list considers site, campaign, memory and core information
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            LI.lock(dataset, reason='staging')

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chope the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s  from cpu requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))

                if blocks:
                    print "limiting to blocks", "\n".join(sorted(blocks))
                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                #prim_destination = [site for site in destinations.keys() if not site in prim_location]
                prim_destination = [
                    site for (site, info) in destinations.items()
                    if info['data_fraction'] == 1 and info['completion'] != 100
                ]
                ## veto the site with no current disk space, for things that are not relval
                prim_destination = [
                    site for site in prim_destination
                    if (SI.disk[site] or wfh.isRelval())
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "Counting existing copies ; now need %s" % copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]

                latching_on_transfers = set()
                [
                    latching_on_transfers.update(info['blocks'].values())
                    for site, info in destinations.items()
                    if site in prim_destination
                ]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                potential_destinations = len(prim_to_distribute)
                #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]

                ## do we want to restrict transfers if the amount of site in vetoe are too large ?

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## means there is openings let me go
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s go-on over transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:

                    existings = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(latching)).filter(
                            TransferImp.workflow_id == wfo.id).all()
                    if not existings:
                        tri = TransferImp(phedexid=int(latching), workflow=wfo)
                        print "adding", wfo.id, "with phedexid", latching
                        session.add(tri)
                    else:
                        for existing in existings:
                            existing.active = True

                    session.flush()

                    can_go = False
                    transfer_sizes[prim] = max(this_load, transfer_sizes[prim])
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog(
                    'transferor',
                    "Counting the copies being made ; then need %s" %
                    copies_needed)
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The input is either fully in place or getting in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim))
                    sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim),
                        level='critical')

                    print json.dumps(prim_to_distribute, indent=2)
                    print json.dumps(prim_location, indent=2)
                    print json.dumps(prim_destination, indent=2)

                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute
                        if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                    ]

                    print "changed to"
                    print json.dumps(prim_to_distribute, indent=2)

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        ### hard include the tape disk andpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        ## prune the blocks/destination that are already in the making, so that subscription don't overlap
                        for site in spreading:
                            for block in list(spreading[site]):
                                if site in destinations and block in destinations[
                                        site]['blocks'].keys():
                                    ## prune it
                                    spreading[site].remove(block)

                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s cannot seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = max(this_load,
                                                   transfer_sizes[prim])
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
                else:
                    can_go = False
                    allowed = False

        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']
            if 'SecondaryLocation' in overide_parameters:
                override_sec_destination = overide_parameters[
                    'SecondaryLocation']
            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = set(
                        [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if k in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    print sec, json.dumps(destinations, indent=2)
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]  ## this is in SE
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                ## how to make unified understand that it has to wait for the secondary if the sec_destination and

                #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in sec_location
                ]
                #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if not SI.CE_to_SE(site) in sec_destination
                ]
                presitespace_sec_to_distribute = copy.deepcopy(
                    sec_to_distribute)
                #sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                #sec_to_distribute = [site for site in sec_to_distribute if not  SI.CE_to_SE(site) in SI.sites_veto_transfer]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]
                ## at this point you have a problem
                if len(sec_to_distribute) == 0 and len(
                        presitespace_sec_to_distribute):
                    sendLog(
                        'transferor',
                        '%s is getting no possible destinations because of lack of space. To be decided what to do in general'
                        % (sec),
                        level='critical')

                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] *
                                1024.) > sec_size or wfh.isRelval():
                            wfh.sendLog('transferor',
                                        'Sending %s to %s' % (sec, site))
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[
                                site_se] * 1024, "GB need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). %s will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024, wfo.name),
                                    level='critical')
                                wfh.sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024))
                else:
                    ## this is bas overall
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else, settin staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
            #session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog(
                        'transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
                    #session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(sorted(no_goes)),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        #if site in SI.sites_veto_transfer:
        #    print site,"does not want transfers"
        #    continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue
        transfered_items = defaultdict(set)
        if execute:
            priority = 'normal'
            cds = [
                ds for ds in set(datasets + block_datasets)
                if ds in max_priority
            ]
            ## bucketize the transfers by priority of workflows
            prioritized_items = defaultdict(set)
            for item in items_to_transfer:
                d = item.split('#')[0]
                p = max_priority.get(d, 80000)
                q = 'normal'
                if p > 100000:
                    q = 'reserved'
                elif p < 70000:
                    q = 'low'
                prioritized_items[q].add(item)

            for priority, items in prioritized_items.items():
                result = makeReplicaRequest(url,
                                            site_se,
                                            list(items),
                                            'prestaging',
                                            priority=priority,
                                            approve=True)
                if result:
                    these_transfers = [
                        o['id'] for o in result['phedex']['request_created']
                    ]
                    #phedexids.extend( these_transfers )
                    for ph in these_transfers:
                        transfered_items[ph].update(items)
                else:
                    sendLog(
                        'transferor',
                        'Could not make a replica request for items %s to site %s'
                        % (items, site_se),
                        level='critical')

            #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True)
            #phedexids = [o['id'] for o in result['phedex']['request_created']]:
        #else:
        #    #result= {'phedex':{'request_created' : []}}
        #    phedexids = []
        #    fake_id-=1

        if not transfered_items:
            sendLog(
                'transferor',
                'Could not make a replica request for items %s to site %s' %
                (items_to_transfer, site),
                level='critical')
            continue
        for phedexid, items in transfered_items.items():
            print phedexid, "transfer created"
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items))):
                for wfid in workflow_dependencies[transfering]:
                    new_transfer = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(phedexid)).filter(
                            TransferImp.workflow_id == wfid).first()
                    if not new_transfer:
                        new_transfer = TransferImp(
                            phedexid=phedexid,
                            workflow=session.query(Workflow).get(wfid))
                        session.add(new_transfer)
                    else:
                        new_transfer.active = True

                    wf_id_in_prestaging.add(wfid)
            #session.commit()

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        #session.commit()

    ## one big session commit at the end that everything went fine
    session.commit()
Example #4
0
def new_recoveror(url, specific, options=None):
    if userLock('recoveror'): return

    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = session.query(Workflow).filter(
        Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend(
            session.query(Workflow).filter(
                Workflow.status == 'assistance-manual').all())

    try:
        from_operator = json.loads(
            os.popen(
                'curl -s http://vocms0113.cern.ch/actions/test.json').read())
        ## now we have a list of things that we can take action on
    except:
        pass

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        if not specific and 'manual' in wfo.status: continue

        wfi = workflowInfo(url, wfo.name)

        send_recovery = False  ## will make all acdc
        send_clone = False  ## will make a clone
        send_back = False  ## should just reject. manual ?
        send_manual = False  ## will set in manual

        where_to_run, missing_to_run = wfi.getRecoveryInfo()

        task_to_recover = where_to_run.keys()

        ## if the site at which the recovery could run in drain or out ?
        for task in task_to_recover:
            not_ready = set(where_to_run[task]) - set(SI.sites_ready)
            if not_ready:
                print "the following sites are not ready for the ACDC", ",".join(
                    sorted(not_ready))
                ## do we have a way of telling if a site is going to be out for a long time ?
                # check on priority: high prio, restart
                if wfi.request['RequestPriority'] >= 85000:
                    send_clone = True
                # check on age of the request
                injection_time = time.mktime(
                    time.strptime(
                        '.'.join(map(str, wfi.request['RequestDate'])),
                        "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
                now = time.mktime(time.gmtime()) / (60. * 60.)
                if float(now - injection_time) < 14.:
                    ## less than 14 days, start over
                    send_clone = True
                else:
                    send_manual = True

        if not send_recovery:
            ## check on whether the stats is very low
            pass

        if send_recovery:
            ## make acdc for all tasks
            for task in task_to_recover:
                actions = list(
                    set([
                        case['solution']
                        for code, case in task_to_recover[task]
                    ]))
                acdc = singleRecovery(url, task, wfi.request, actions, do=True)
        elif send_clone:
            ## this will get it cloned
            wfo.status = 'assistance-clone'
            session.commit()
        elif send_manual:
            wfo.status = 'assistance-manual'
Example #5
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads(
        open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update(
        json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))))
                sendLog(
                    'assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))),
                    level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks +
                                  getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([
                        SI.SE_to_CE(psite)
                        for (psite, (there, frac)) in presence.items() if there
                    ]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Example #6
0
def actor(url,options=None):
    
    if userLock('actor'): return
    
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return
    
   # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    
    # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed
    try:
        action_list = json.loads(os.popen('curl -s -k https://vocms0113.cern.ch:80/getaction?days=15').read())
        ## now we have a list of things that we can take action on
    except:
        print "Not able to load action list :("
        sendLog('actor','Not able to load action list', level='critical')
        return

    print action_list
    if not action_list:
        print "EMPTY!"
        return

    for wfname in action_list:
        print '-'*100
        print "Looking at",wfname,"for recovery options"

        to_clone = False
        to_acdc = False
        for key in action_list[wfname]:
            if key == 'Parameters':
                tasks =  action_list[wfname][key]
            elif key == 'Action' and action_list[wfname][key] == 'acdc':
                print "Going to create ACDCs for ", wfname
                to_acdc = True
            elif key == 'Action' and action_list[wfname][key] == 'clone':
                print "Going to clone ", wfname
                to_clone = True

        if not to_acdc and not to_clone:
            sendLog('actor','Action submitted for something other than acdc and clone for workflow %s'%wfname,level='critical')
            print "Can only do acdcs and clones! Skipping workflow ",wfname
            continue
        if not tasks:
            sendLog('actor','Empty action submitted for workflow %s'%wfname,level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

#===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor','Going to clone %s'%wfname)
            results=[]
            datasets = set(wfi.request['OutputDatasets'])

            comment=""

            if 'comment' in tasks: comment = ", reason: "+ tasks['comment']
            wfi.sendLog('actor',"invalidating the workflow by traffic controller %s"%comment)

            #Reject all workflows in the family
            #first reject the original workflow.
            reqMgrClient.invalidateWorkflow(url, wfi.request['RequestName'], current_status=wfi.request['RequestStatus'], cascade=False)
            #Then reject any ACDCs associated with that workflow
            if 'ACDCs' in action_list[wfname]:
                children = action_list[wfname]['ACDCs']
                for child in children:
                    wfi.sendLog('actor',"rejecting %s"%child)
                    wfi_acdc = workflowInfo(url, child)
                    reqMgrClient.invalidateWorkflow(url, wfi_acdc.request['RequestName'], current_status=wfi_acdc.request['RequestStatus'], cascade=False)
                    datasets.update( wfi_acdc.request['OutputDatasets'] )
            #Invalidate all associated output datasets
            for dataset in datasets:
                results.append( setDatasetStatus(dataset, 'INVALID') )

            if all(map(lambda result : result in ['None',None,True],results)):
                wfi.sendLog('actor',"%s and children are rejected"%wfname)

            cloned = None
            try:    
                cloned =  singleClone(url, wfname, tasks, comment, options.do)
            except:
                sendLog('actor','Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'%wfname,level='critical')
                wfi.sendLog('actor','Failed to create clone for %s!'%wfname)
                remove_action(wfname)
            if not cloned:
                recover = False
                wfi.sendLog('actor','Failed to create clone for %s!'%wfname)
                sendLog('actor','Failed to create clone for %s!'%wfname,level='critical')

            else:
                wfi.sendLog('actor',"Workflow %s cloned"%wfname)


#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        if setting in tasks[task]:
                            tasks[task][setting] = allTasksDefaults[setting]
                        else:
                                tasks[task].append({setting:allTasksDefaults[setting]})
            print "Tasks is "
            print tasks

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves
        
            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog('actor','Cannot create ACDCS for %s because WMErr cannot be reached.'%wfname,level='critical')
                continue
            if not WMErr:
                sendLog('actor','Cannot create ACDCS for %s because WMErr is blank.'%wfname,level='critical')
                print "Moving on. WMErr is blank"
                continue

            try:
                where_to_run, missing_to_run,missing_to_run_at =  wfi.getRecoveryInfo()
                print "Where to run = "
                print where_to_run
            except:
                sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog('actor','Cannot create ACDCS for %s because site list cannot be found.'%wfname,level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""
        
            num_tasks_to_recover = 0
        
            for task in WMErr:
                if 'LogCollect' in task: continue
                if 'Cleanup' in task: continue
                if not 'jobfailed' in WMErr[task]:
                    continue
                else:
                    num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for",wfname
#            recover = False
        
            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
            ## we do not try to recover pLHE
                sendLog('actor','Cannot create ACDCS for %s because it is a pLHE workflow.'%wfname,level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)


#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
        #   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
#                print "Full task name is " + fulltaskname
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in ['Processing','Production','Merge']:
                            wrong_task=True
                            wfi.sendLog('actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"%( fulltaskname, task_info.taskType))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites=[SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites=list(set([SI.SE_to_CE(site) for site in actions[action]]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]]))
                    print "Found",sorted(assign_to_sites),"as sites where to run the ACDC at, from the acdc doc of ",wfname
                print "Going to run at",sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do = options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical')
                                wfi.sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
#                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else: #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog('actor','ACDC created for task %s. Actions taken \n%s'%(fulltaskname,list(actions)))
                        team = wfi.request['Teams'][0]
                        parameters={
                        'SiteWhitelist' : sorted(assign_to_sites),
                        'AcquisitionEra' : wfi.acquisitionEra(),
                        'ProcessingString' :  wfi.processingString(),
                        'MergedLFNBase' : wfi.request['MergedLFNBase'],
                        'ProcessingVersion' : wfi.request['ProcessingVersion'],
                        }
                    ## hackery for ACDC merge assignment
                        if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary 
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists']=='true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC",acdc
                            parameters['execute']=True
                            wfi.sendLog('actor',"%s  was assigned for recovery"% acdc)
                        else:
                            print "no assignment done with this ACDC",acdc
                            sendLog('actor',"%s needs to be assigned"%(acdc), level='critical')
                            continue
 #                       print parameters
                        result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                        if not result:
                            print acdc,"was not assigned"
                            sendLog('actor',"%s needs to be assigned"%(acdc), level='critical')
                        else:
                            recovering.add( acdc )
                        wfi.sendLog('actor',"ACDCs created for %s"%wfname)
        #===========================================================
        
        
        if recover and options.do:
            remove_action(wfname)

        if message_to_user:
            print wfname,"to be notified to user(DUMMY)",message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')



    return
Example #7
0
def closor(url, specific=None, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    wfs_n = [w.name for w in wfs]
    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)

    held = set()

    print len(wfs), "closing"
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if options.limit: max_per_round = options.limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]

        def rank(wfn):
            return all_closedout.index(wfn) if wfn in all_closedout else 0

        wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    for iwfo, wfo in enumerate(wfs):

        if specific and not specific in wfo.name: continue

        print "Progress [%d/%d]" % (iwfo, len(wfs))
        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url,
                                                   batch_name,
                                                   details=True)
                batch_go[batch_name] = all(
                    map(
                        lambda s: not s in [
                            'completed', 'running-open', 'running-closed',
                            'acquired', 'assigned', 'assignment-approved'
                        ], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog(
                    'closor',
                    'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'
                    % (batch_name, batch_name))
                continue

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived'
                                            ] and not options.force:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog(
                'closor',
                '%s is announced already : %s' % (wfo.name, wfo.wm_status))
        session.commit()

        if jump_the_line:
            wfi.sendLog('closor', 'Announcing while completing')

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis'] == 0:
            print wfo.name, "is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(
                Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            fraction = lumi_count / float(expected_lumis) * 100.

            completion_line = "%60s %d/%d = %3.2f%%" % (
                out, lumi_count, expected_lumis, fraction)
            wfi.sendLog('closor', "\t%s" % completion_line)
            if wfi.isRelval() and fraction < batch_goodness:
                batch_warnings[wfi.getCampaign()].add(completion_line)
            stats[out] = lumi_count
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites'] + wfi.request[
                    'CustodialSites']
                wfi.sendLog(
                    'closor', "%s is not in full anywhere. send to %s" %
                    (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items()
                                       if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items()
                                   if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting

        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and (
            (wfi.request['RequestStatus'] in ['closed-out']) or options.force
                or jump_the_line):
            print wfo.name, "to be announced"
            results = []
            if not results:
                for out in outputs:
                    if out in stats and not stats[out]:
                        continue
                    _, dsn, process_string, tier = out.split('/')

                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')

                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if destinations:
                            wfi.sendLog(
                                'closor', '%s to go to %s' %
                                (out, ', '.join(sorted(destinations))))

                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(
                                url,
                                site, [out],
                                'Copy for release validation consumption',
                                priority='normal',
                                approve=True,
                                mail=False,
                                group='RelVal')
                            try:
                                request_id = result['phedex'][
                                    'request_created'][0]['id']
                                results.append(True)
                            except:
                                results.append('Failed relval transfer')

                    elif all_OK[out]:

                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request[
                                    'Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[
                                campaign] and tier in CI.campaigns[campaign][
                                    'toDDM']:
                            to_DDM = True

                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get(
                                "tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog(
                                'closor',
                                "could not recognize %s for injecting in DDM" %
                                out,
                                level='critical')
                            continue

                        n_copies = 1
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[
                                campaign]:
                            ddm_instructions = CI.campaigns[campaign][
                                'DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items(
                                ):
                                    if ddmtier == tier or ddmtier in [
                                            '*', 'all'
                                    ]:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(
                                destinations)
                        group_spec = ""  ## not used yet
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending", out, " to DDM"
                            status = pass_to_dynamo(
                                [out],
                                N=n_copies,
                                sites=destinations if destinations else None,
                                group=group_spec if group_spec else None)
                            results.append(status)
                            if status == True:
                                wfi.sendLog(
                                    'closor',
                                    '%s is send to dynamo in %s copies %s %s' %
                                    (out, n_copies, sorted(destinations),
                                     group_spec))
                            else:
                                sendLog('closor',
                                        "could not add " + out +
                                        " to dynamo pool. check closor logs.",
                                        level='critical')
                                wfi.sendLog(
                                    'closor', "could not add " + out +
                                    " to dynamo pool. check closor logs.")
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                if all(
                        map(lambda result: result in ['None', None, True],
                            results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(
                            url, wfo.name)
                        if not res in ['None', None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            wfo.wm_status = wl_bis.request['RequestStatus']
                            session.commit()
                            if wl_bis.request['RequestStatus'] in [
                                    'announced', 'normal-archived'
                            ]:
                                res = None
                            else:
                                ## retry ?
                                res = reqMgrClient.announceWorkflowCascade(
                                    url, wfo.name)

                        results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True],
                       results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        wfo.status = wfo.status.replace(
                            'announce', 'announced')
                else:
                    wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow outputs are announced")
            else:
                wfi.sendLog(
                    'closor', "Error with %s to be announced \n%s" %
                    (wfo.name, json.dumps(results)))

        elif wfi.request['RequestStatus'] in [
                'failed', 'aborted', 'aborted-archived', 'rejected',
                'rejected-archived', 'aborted-completed'
        ]:
            if wfi.isRelval():
                wfo.status = 'forget'
                wfo.wm_status = wfi.request['RequestStatus']
                wfi.sendLog(
                    'closor',
                    "%s is %s, but will not be set in trouble to find a replacement."
                    % (wfo.name, wfo.wm_status))
            else:
                wfo.status = 'trouble'
                wfo.wm_status = wfi.request['RequestStatus']
            session.commit()
        else:
            print wfo.name, "not good for announcing:", wfi.request[
                'RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" %
                ("\n".join(sorted(held))),
                level='critical')

    #batches = json.loads(open('batches.json').read())
    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            if batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s 
This is an automated message.
""" % (bname, bname, issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)
Example #8
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock('transferor'): return

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())
    max_to_handle = options.maxworkflows
    allowed_to_handle = max(0, max_to_handle - being_handled)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"
    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'considered').all():
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    input_sizes = {}
    ## list the size of those in transfer already
    in_transfer_priority = 0
    min_transfer_priority = 100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
        in_transfer_priority = max(in_transfer_priority,
                                   int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority,
                                    int(wfh.request['RequestPriority']))
    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    in_transfer_already = sum(input_sizes.values())

    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
    print "... done"

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = {}
    went_over_budget = False
    for (wfo, wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name, "to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_, primary, _, _) = wfh.getIO()
        this_load = sum([input_sizes[prim] for prim in primary])
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load" % this_load
            print "%15.4f GB already this round" % sum(transfer_sizes.values())
            print "%15.4f GB is the available limit" % transfer_limit
            went_over_budget = True
            if int(
                    wfh.request['RequestPriority']
            ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request[
                    'RequestPriority'], ">=", in_transfer_priority, "go-on over budget"
            else:
                if not options.go:
                    print min_transfer_priority, "minimum priority", wfh.request[
                        'RequestPriority'], "<", in_transfer_priority, "stop"
                    continue

        ## throtlle by campaign go
        if not CI.go(wfh.request['Campaign']):
            print "No go for", wfh.request['Campaign']
            if not options.go: continue

        ## check if the batch is announced
        announced = False
        is_real = False
        for b in mcm.getA('batches', query='contains=%s' % wfo.name):
            is_real = True
            if b['status'] == 'announced':
                announced = True
                break

        if not announced:
            print wfo.name, "does not look announced."  # skipping?, rejecting?, reporting?"

        if not is_real:
            print wfo.name, "does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(
            time.strptime('.'.join(map(str, wfh.request['RequestDate'])),
                          "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
        now = time.mktime(time.gmtime()) / (60. * 60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced:
                print "It is too soon to start transfer: %3.2fH remaining" % (
                    now - injection_time)
                continue

        passing_along += 1
        if passing_along >= allowed_to_handle:
            if int(
                    wfh.request['RequestPriority']
            ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request[
                    'RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle
            else:
                print "Not allowed to pass more than", max_to_handle, "at a time. Currently", being_handled, "handled, and adding", passing_along
                break

        (lheinput, primary, parent, secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList(
                (lheinput, primary, parent, secondary))

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(
                wfh.request['Campaign'])['SiteWhitelist']

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']

        can_go = True
        staging = False
        if primary:
            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chope the primary dataset
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))
                sites_really_allowed = [
                    site for site in sites_allowed if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                print "Sites allowed minus the vetoed transfer"
                print sites_really_allowed
                copies_needed = int(
                    0.35 * len(sites_really_allowed)
                ) + 1  ## should just go for a fixed number based if the white list grows that big
                print "Would make", copies_needed, "copies"
                if options.maxcopy > 0:
                    copies_needed = min(options.maxcopy, copies_needed)

                ## remove the sites that do not want transfers
                print "need", copies_needed
                workflow_dependencies[prim].add(wfo.id)
                presence = getDatasetPresence(url, prim)
                prim_location = [
                    site for site, pres in presence.items() if pres[0] == True
                ]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at", len(
                        prim_location), "sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0, copies_needed - len(prim_location))
                print "now need", copies_needed
                subscriptions = listSubscriptions(url, prim)
                prim_destination = list(
                    set([
                        site
                        for (site, (tid, decision)) in subscriptions.items()
                        if decision and not any([
                            site.endswith(veto)
                            for veto in ['MSS', 'Export', 'Buffer']
                        ])
                    ]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [
                    site for site in prim_destination
                    if not site in prim_location
                ]
                ## add transfer dependencies
                latching_on_transfers = list(
                    set([
                        tid
                        for (site, (tid, decision)) in subscriptions.items()
                        if decision and site in prim_destination and not any([
                            site.endswith(veto)
                            for veto in ['MSS', 'Export', 'Buffer']
                        ])
                    ]))
                print latching_on_transfers
                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(
                        Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)

                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush(
                        )  ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                copies_needed = max(0, copies_needed - len(prim_destination))
                print "then need", copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with", latching_on_transfers
                    can_go = True
                    continue
                prim_to_distribute = [
                    site for site in sites_allowed if not any(
                        [osite.startswith(site) for osite in prim_location])
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any(
                        [osite.startswith(site) for osite in prim_destination])
                ]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        spreading = distributeToSites(getDatasetChops(prim),
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges)
                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            spreading[site] = [prim]
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)

        if secondary:
            if talk:
                print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add(wfo.id)
                presence = getDatasetPresence(url, sec)
                sec_location = [
                    site for site, pres in presence.items() if pres[1] > 90.
                ]  ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions(url, sec)
                sec_destination = [site for site in subscriptions]
                sec_to_distribute = [
                    site for site in sites_allowed if
                    not any([osite.startswith(site) for osite in sec_location])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any(
                        [osite.startswith(site) for osite in sec_destination])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if len(sec_to_distribute) > 0:
                    for site in sec_to_distribute:
                        all_transfers[site].append(sec)
                        can_go = False

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name, "latches on existing transfers, and nothing else"
                wfo.status = 'staging'
            else:
                print wfo.name, "should just be assigned NOW to", sites_allowed
                wfo.status = 'staged'
            print "setting status to", wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name, "latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to", wfo.status
                    session.commit()
            print wfo.name, "needs a transfer"
            needs_transfer += 1

    #print json.dumps(all_transfers)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to", site, "(CE)", site_se, "(SE) for"
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"

        print "\t", len(blocks), "blocks"
        ## remove blocks if full dataset is send out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        print "\t", len(blocks), "needed blocks for", list(
            set([block.split('#')[0] for block in blocks]))
        print "\t", len(datasets), "datasets"
        print "\t", datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue

        if execute:
            result = makeReplicaRequest(url,
                                        site_se,
                                        items_to_transfer,
                                        'prestaging',
                                        priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result = {'phedex': {'request_created': []}}
            fake_id -= 1

        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(
                Transfer.phedexid == phedexid).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            new_transfer.workflows_id = set()
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update(
                    workflow_dependencies[transfering])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        session.commit()
Example #9
0
def closor(url, specific=None, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return


    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    jump_the_line = options.announce if options else False
    if jump_the_line:
        wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status=='close').all()

    held = set()

    print len(wfs),"closing"
    max_per_round = UC.get('max_per_round').get('closor',None)
    if options.limit: max_per_round = options.limit
    random.shuffle( wfs )    
    if max_per_round: wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    for wfo in wfs:

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis 
        wfi = workflowInfo(url, wfo.name )
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url , batch_name, details=True)
                batch_go[ batch_name ]  = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog('closor', 'Cannot close for now because the batch %s is not all close'% batch_name)
                continue


        if wfi.request['RequestStatus'] in  ['announced','normal-archived'] and not options.force:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status))
        session.commit()

        if jump_the_line:
            wfi.sendLog('closor','Announcing while completing')

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis']==0:
            print wfo.name,"is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda : False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs): 
            print wfo.name,wfi.request['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            fraction = lumi_count/float(expected_lumis)*100.

            completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction)
            wfi.sendLog('closor',"\t%s"% completion_line)
            if wfi.isRelval() and fraction < batch_goodness:
                batch_warnings[ wfi.getCampaign()].add( completion_line )
            stats[out] = lumi_count
            all_OK[out] = True 


        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence( url, out )
            where = [site for site,info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out,"is in full at",",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
                wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
                at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
                else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
                print json.dumps( at_destination )
                print json.dumps( else_where, indent=2 )
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to = there )
                    for l in late_info:
                        l.update({"workflow":wfo.name,"dataset":out})
                    all_late_files.extend( late_info )
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

    
        ## verify if we have to do harvesting

        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update( OK )

        ## only that status can let me go into announced
        if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line):
            print wfo.name,"to be announced"
            results=[]
            if not results:
                for out in outputs:
                    if out in stats and not stats[out]: 
                        continue
                    _,dsn,process_string,tier = out.split('/')

                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')

                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        
                        if destinations:
                            wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations ))))

                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal')
                            try:
                                request_id =  result['phedex']['request_created'][0]['id']
                                results.append( True )
                            except:
                                results.append( 'Failed relval transfer' )
                        
                    elif all_OK[out]:

                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True

                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"):
                            print "tier",tier,"neither TO or NO DDM for",out
                            results.append('Not recognitized tier %s'%tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical')
                            continue

                        n_copies = 2
                        destinations=[]
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier,indication in ddm_instructions.items():
                                    if ddmtier==tier or ddmtier in ['*','all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']
                                            
                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination="+",".join( destinations )
                        group_spec = "" ## not used yet 
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending",out," to DDM"
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 0 --exec'%(n_copies, out,destination_spec, group_spec))
                            ddm_text = p.read()
                            print ddm_text
                            status = p.close()
                            if status!=None:
                                print "Failed DDM, retrying to send",out,"a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 1 --exec'%(n_copies, out,destination_spec, group_spec))

                                ddm_text = p.read()
                                print ddm_text
                                status = p.close()    
                                if status!=None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical')
                                    if options.force: status = True
                            results.append( status )
                            if status == None:
                                wfi.sendLog('closor',ddm_text)
                                wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec))
                                                            
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                if all(map(lambda result : result in ['None',None,True],results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                        if not res in ['None',None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            wfo.wm_status = wl_bis.request['RequestStatus']
                            session.commit()
                            if wl_bis.request['RequestStatus'] in  ['announced','normal-archived']:
                                res = None
                            else:
                                ## retry ?
                                res = reqMgrClient.announceWorkflowCascade(url, wfo.name) 
                            
                        results.append( res )
                                
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        wfo.status = wfo.status.replace('announce','announced')
                else:
                    wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor',"workflow outputs are announced")
            else:
                wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results )))
            
        elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            if wfi.isRelval():
                wfo.status = 'forget'
                wfo.wm_status = wfi.request['RequestStatus']
                wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, wfo.wm_status))
            else:
                wfo.status = 'trouble'
                wfo.wm_status = wfi.request['RequestStatus']
            session.commit()
        else:
            print wfo.name,"not good for announcing:",wfi.request['RequestStatus']
            wfi.sendLog('closor',"cannot be announced")
            held.add( wfo.name )

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries']>=retries_late]
    really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) )
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor',subject)
        print subject
        open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2))

    if held:
        sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical')


    #batches = json.loads(open('batches.json').read())
    for bname,go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s"% bname
            issues=""
            if batch_warnings[ bname ]:
                issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
                issues+="\n".join( sorted( batch_warnings[ bname ] ))
                issues+="\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s 
This is an automated message.
"""%( bname, 
      bname,
      issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to )
Example #10
0
def actor(url, options=None):

    if moduleLock(wait=False, silent=True)(): return
    if userLock('actor'): return

    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed
    try:
        action_list = json.loads(
            os.popen(
                'curl -s -k https://vocms0113.cern.ch:80/getaction?days=15').
            read())
        ## now we have a list of things that we can take action on
    except:
        try:
            action_list = json.loads(
                os.popen(
                    'curl -s -k https://vocms0113.cern.ch/getaction?days=15').
                read())
        except:
            print "Not able to load action list :("
            sendLog('actor', 'Not able to load action list', level='critical')
            return

    if options.actions:
        action_list = json.loads(open(options.actions).read())

    print json.dumps(action_list, indent=2)
    if not action_list:
        print "EMPTY!"
        return

    wf_list = action_list.keys()
    print json.dumps(sorted(wf_list), indent=2)
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print '-' * 100
        print "Looking at", wfname, "for recovery options"

        to_clone = False
        to_acdc = False
        to_force = False
        to_hold = False
        something_to_do = False
        tasks = action_list[wfname].get('Parameters', None)
        to_acdc = action_list[wfname].get('Action', None) == 'acdc'
        to_clone = action_list[wfname].get('Action', None) == 'clone'
        to_force = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['by-pass', 'bypass']
        to_hold = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['onhold', 'on-hold']

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog(
                'actor',
                'Action submitted for something other than acdc, clone, bypass or hold for workflow %s'
                % wfname,
                level='critical')
            print json.dumps(action_list[wfname], indent=2)
            continue

        if not tasks and to_acdc:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

        #===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            results = []
            datasets = set(wfi.request['OutputDatasets'])

            comment = ""

            if 'comment' in tasks: comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            #Reject all workflows in the family
            #first reject the original workflow.
            reqMgrClient.invalidateWorkflow(
                url,
                wfi.request['RequestName'],
                current_status=wfi.request['RequestStatus'],
                cascade=False)
            #Then reject any ACDCs associated with that workflow
            family = getWorkflowById(url, wfi.request['PrepID'], details=True)
            for fwl in family:
                print "rejecting", fwl['RequestName'], fwl['RequestStatus']
                wfi.sendLog(
                    'actor', "rejecting %s, previous status %s" %
                    (fwl['RequestName'], fwl['RequestStatus']))
                reqMgrClient.invalidateWorkflow(
                    url,
                    fwl['RequestName'],
                    current_status=fwl['RequestStatus'],
                    cascade=False)
                datasets.update(fwl['OutputDatasets'])
            #Invalidate all associated output datasets
            for dataset in datasets:
                results.append(setDatasetStatus(dataset, 'INVALID'))

            if all(map(lambda result: result in ['None', None, True],
                       results)):
                wfi.sendLog('actor', "%s and children are rejected" % wfname)

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print str(e)
                ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again
                #remove_action(wfname)
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')

            else:
                wfi.sendLog('actor', "Workflow %s cloned" % wfname)

#===========================================================
        elif to_force:
            wfi.sendLog('actor',
                        'Bypassing from workflow traffic controler request')
            forcing = json.loads(
                open(
                    '/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json'
                ).read())
            forcing.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json',
                 'w').write(json.dumps(sorted(set(forcing))))
        elif to_hold:
            wfi.sendLog('actor',
                        'Holding on workflow traffic controler request')
            holding = json.loads(
                open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json').
                read())
            holding.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json',
                 'w').write(json.dumps(sorted(set(holding))))
#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        if setting in tasks[task]:
                            tasks[task][setting] = allTasksDefaults[setting]
                        else:
                            tasks[task].append(
                                {setting: allTasksDefaults[setting]})
            print "Tasks is "
            print json.dumps(tasks, indent=2)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print "FYI getWMErrors is blank. Presumably there are only unreported errors"
#                continue

            try:
                where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo(
                )
                print "Where to run = "
                print where_to_run
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because site list cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""

            num_tasks_to_recover = 0

            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task: continue
                    if 'Cleanup' in task: continue
                    if not 'jobfailed' in WMErr[task]:
                        continue
                    else:
                        num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for", wfname
#            recover = False

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)

#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
#   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
                #                print "Full task name is " + fulltaskname
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                set([
                                    SI.SE_to_CE(site)
                                    for site in actions[action]
                                ]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(
                        set([SI.SE_to_CE(site)
                             for site in where_to_run[task]]))
                    print "Found", sorted(
                        assign_to_sites
                    ), "as sites where to run the ACDC at, from the acdc doc of ", wfname
                print "Going to run at", sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url,
                                          fulltaskname,
                                          wfi.request,
                                          actions,
                                          do=options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)),
                                    level='critical')
                                wfi.sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
                            #                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else:  #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog(
                            'actor',
                            'ACDC created for task %s. Actions taken \n%s' %
                            (fulltaskname, json.dumps(actions)))
                        #team = wfi.request['Teams'][0]
                        team = 'production'
                        parameters = {
                            'SiteWhitelist': sorted(assign_to_sites),
                            'AcquisitionEra': wfi.acquisitionEra(),
                            'ProcessingString': wfi.processingString(),
                            'MergedLFNBase': wfi.request['MergedLFNBase'],
                            'ProcessingVersion':
                            wfi.request['ProcessingVersion'],
                        }
                        ## hackery for ACDC merge assignment
                        if wfi.request[
                                'RequestType'] == 'TaskChain' and 'Merge' in task.split(
                                    '/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request
                              and wfi.request['TrustSitelists'] == 'true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'secondary' in actions:
                            if actions['secondary'] == 'enabled':
                                print 'Enabling reading the secondary input via xrootd'
                                parameters['TrustPUSitelists'] = True
                            elif actions['secondary'] == 'disabled':
                                parameters['TrustPUSitelists'] = False
                            #in case secondary is blank or not set to enabled or disabled
                            elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                    'TrustPUSitelists']:
                                parameters['TrustPUSitelists'] = True
                        elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                'TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC", acdc
                            parameters['execute'] = True
                            wfi.sendLog('actor',
                                        "%s  was assigned for recovery" % acdc)
                        else:
                            print "no assignment done with this ACDC", acdc
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                            continue

#                       print parameters
                        result = reqMgrClient.assignWorkflow(
                            url, acdc, team, parameters)
                        if not result:
                            print acdc, "was not assigned"
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                        else:
                            recovering.add(acdc)
                        wfi.sendLog('actor', "ACDCs created for %s" % wfname)
        #===========================================================

        if recover and options.do:
            remove_action(wfname)

        if message_to_user:
            print wfname, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')

    return
Example #11
0
def closor(url, specific=None, options=None):
    if userLock(): return
    mlock  = moduleLock()
    if mlock(): return
    up = componentInfo(soft=['mcm','wtc'])
    if not up.check(): return


    UC = unifiedConfiguration()
    CI = campaignInfo()
    BI = batchInfo()
    CloseI = closeoutInfo()

    all_late_files = []

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status=='close').all()

    if specific:
        wfs = [wfo for wfo in wfs if specific in wfo.name]
    wfs_n = [w.name for w in wfs]

    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)
    
    held = set()

    print len(wfs),"closing"
    random.shuffle( wfs )    
    max_per_round = UC.get('max_per_round').get('closor',None)
    if options.limit: max_per_round = options.limit

    if max_per_round: 
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key = lambda r : r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]
        def rank( wfn ):
            return all_closedout.index( wfn ) if wfn in all_closedout else 0

        wfs = sorted( wfs, key = lambda wfo : rank( wfo.name ),reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    closers = []

    print len(wfs),"closing"
    th_start = time.mktime(time.gmtime())

    for iwfo,wfo in enumerate(wfs):
        if specific and not specific in wfo.name: continue

        closers.append( CloseBuster(
            wfo = wfo,
            url = url,
            CI = CI,
            UC = UC,
            jump_the_line = jump_the_line,
            batch_goodness = batch_goodness,
            batch_go = batch_go,
            #stats = stats,
            batch_warnings = batch_warnings,
            all_late_files = all_late_files,
            held = held,
            ))

    
    run_threads = ThreadHandler( threads = closers,
                                 n_threads = options.threads,
                                 sleepy = 10,
                                 timeout = None,
                                 verbose = True,
                                 label = 'closor')

    run_threads.start()


    ## waiting on all to complete
    while run_threads.is_alive():
        #print "Waiting on closing threads",time.asctime(time.gmtime())
        time.sleep(5)

    JC = JIRAClient() if up.status.get('jira',False) else None
    print len(run_threads.threads),"finished thread to gather information from"
    failed_threads = 0
    for to in run_threads.threads:
        if to.failed:
            failed_threads += 1
            continue
        if to.outs:
            for outO in to.outs:
                out = outO.datasetname
                odb = session.query(Output).filter(Output.datasetname==out).first()
                if not odb:
                    print "adding an output object",out
                    session.add( outO )
                else:
                    odb.date = outO.date
                
        if to.to_status:
            to.wfo.status = to.to_status
            if JC and to.to_status == "done" and to.wfi:
                jiras = JC.find({"prepid" : to.wfi.request['PrepID']})
                for jira in jiras:
                    JC.close(jira.key)

        if to.to_wm_status:
            to.wfo.wm_status = to.to_wm_status
        if to.closing:
            CloseI.pop( to.wfo.name )

        session.commit()

    th_stop = time.mktime(time.gmtime())

    if wfs:
        time_spend_per_workflow = (th_stop-th_start) / float(len(wfs))
        print "Average time spend per workflow is", time_spend_per_workflow

    if float(failed_threads/run_threads.n_threads) > 0:
        sendLog('checkor','%d/%d threads have failed, better check this out'% (failed_threads, run_threads.n_threads), level='critical')
        sendEmail('checkor','%d/%d threads have failed, better check this out'% (failed_threads,run_threads.n_threads))

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries']>=retries_late]
    really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) )
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor',subject)
        print subject
        open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2))

    if held:
        sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical')

    for bname,go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s"% bname
            issues=""
            if batch_warnings[ bname ]:
                issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
                issues+="\n".join( sorted( batch_warnings[ bname ] ))
                issues+="\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s 
This is an automated message.
"""%( bname, 
      bname,
      issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to )
            ## just announced ; take it out now.
            BI.pop( bname )


    if os.path.isfile('.closor_stop'):
        print "The loop on workflows was shortened"
        sendEmail('closor','Closor loop was shortened artificially using .closor_stop')
        os.system('rm -f .closor_stop')
Example #12
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock('assignor'): return

    CI = campaignInfo()
    SI = siteInfo()

    wfos=[]
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
            wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
        wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print wfo.name,"to be assigned"
        wfh = workflowInfo( url, wfo.name)


        ## check if by configuration we gave it a GO
        if not CI.go( wfh.request['Campaign'] ) and not options.go:
            print "No go for",wfh.request['Campaign']
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            print wfo.name,wfh.request['RequestStatus'],"skipping"
            if not options.test:
                continue

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                continue

        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print "Allowed",sites_allowed
        sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        sites_custodial = []
        if len(sites_custodial)==0:
            print "No custodial, it's fine, it's covered in close-out"

        if len(sites_custodial)>1:
            print "more than one custodial for",wfo.name
            sys.exit(36)

        secondary_locations=None
        for sec in list(secondary):
            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence( url, prim )
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] )
            sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_with_any_data = [site for site in sites_with_any_data if any([osite.startswith(site) for osite in presence.keys()])]
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        ## opportunistic running where any piece of data is available
        if secondary_locations and primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set(sites_allowed))]
            print "We could be running at",opportunistic_sites,"in addition"

        if available_fractions and not all([available>=1. for available in available_fractions.values()]):
            print "The input dataset is not located in full at any site"
            print json.dumps(available_fractions)
            if not options.test and not options.go: continue ## skip skip skip
        copies_wanted = 2.
        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values()
            if not options.go:
                continue

        ## default back to white list to original white list with any data
        print "Allowed",sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data",sites_allowed

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name,"cannot be assign with no matched sites"
            continue

        parameters={
            'SiteWhitelist' : sites_allowed,
            'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : '/store/mc', ## to be figured out ! from Hi shit
            'ProcessingVersion' : version,
            }

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if ',' in v: parameters[key] = filter(None,v.split(','))
                    else: parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            ## needs to go to event based ? fail for now
            print "Falling back to event splitting ?"
            #parameters['SplittingAlgorithm'] = 'EventBased'
            continue

        ## plain assignment here
        team='production'
        if options and options.team:
            team = options.team
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
            else:
                print "ERROR could not assign",wfo.name
        else:
            pass
def transferor(url ,specific = None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    LI = lockInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    needing_locks=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=0
    min_transfer_priority=100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            input_sizes[prim] = dss.get( prim )
            print "\t",wfo.name,"needs",input_sizes[prim],"GB"
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())
    # shuffle first by name
    random.shuffle( wfs_and_wfh )
    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)
    

    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            input_sizes[prim] = dss.get( prim )
    print "... done"

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer 
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    for (wfo,wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name,"to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load"%this_load
            print "%15.4f GB already this round"%sum(transfer_sizes.values())
            print "%15.4f GB is the available limit"%transfer_limit
            went_over_budget=True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget"
            else:
                if not options.go: 
                    print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop"
                    continue


        ## throtlle by campaign go
        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            if not options.go: 
                sendEmail("no go for managing","No go for "+wfh.request['Campaign'])
                continue

        ## check if the batch is announced

        def check_mcm(wfn):
            announced=False
            is_real=False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                    is_real = True
                    if b['status']=='announced': 
                        announced=True 
                        break
            except:
                try:
                    for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                        is_real = True
                        if b['status']=='announced': 
                            announced=True 
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced,is_real

        if not use_mcm:
            announced,is_real = False,True
        else:
            announced,is_real = check_mcm( wfo.name )

        if not announced:
            print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?"
            
        if not is_real:
            print wfo.name,"does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transfered
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle
            else:
                print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along
                if not options.go: break

        if this_load and needs_transfer >= allowed_to_transfer:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transfered
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer
            else:
                print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"transfering, and adding",needs_transfer
                if not options.go: continue


        (lheinput,primary,parent,secondary) = wfh.getIO()
        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            NLI.lock( dataset )

        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist']))

        ## reduce right away to sites in case of memory limitation
        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] )
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"are",memory_allowed
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))

        if not sites_allowed:
            print wfo.name,"has no possible sites to run at"
            print "available for",wfh.request['Memory'],"are",memory_allowed
            sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## should make the block selection here
            pass

        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## same, we could be doing the white list here too
            pass


        if blocks:
            print "Reading",len(blocks),"in whitelist"

        can_go = True
        staging=False
        allowed=True
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chope the primary dataset 
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))
                sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed minus the vetoed transfer"
                print sorted(sites_allowed)

                copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big
                print "Would make",copies_needed_from_site,"copies from site white list"
                copies_needed = copies_needed_from_site

                print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh
                copies_needed = copies_needed_from_CPUh

                if options.maxcopy>0:
                    ## stop maxing things out ??
                    #copies_needed = min(options.maxcopy,copies_needed)
                    #print "Maxed to",copies_needed
                    if copies_needed_from_CPUh > options.maxcopy:
                        sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh))

                
                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,copies_needed_from_site)
                    print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign']

                ## remove the sites that do not want transfers                
                workflow_dependencies[prim].add( wfo.id )

                #####################################
                ###### JR 3/8/15 #### deprecating this
                """
                presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                prim_parts = [site for site,pres in presence.items() if pres[0]==False]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                subscriptions = listSubscriptions( url , prim , sites_allowed )
                prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                ## remove the subscription where the dataset is in parts at
                #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers =  list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                print latching_on_transfers
                """
                ###### JR 3/8/15 #### deprecating this
                #####################################


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps')
                #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]
                ## need to take out the transfer veto
                prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                for dsite in prim_destination:
                    needing_locks[dsite].append( prim )

                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites",prim_location
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there is openings let me go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    for site in sites_allowed:
                        #increment accross the board, regardless of real destination: could be changed
                        transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available"
                    else:
                        print "Not allowed to transfer more than",max_staging_per_site," per site at a time. Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                            
                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                print "then need",copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with",latching_on_transfers
                    can_go = True
                    continue

                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    print "selected CE destinations",spreading.keys()
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )

        if not allowed:
            print "Not allowed to move on with",wfo.name
            continue


        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )

                if False:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = destination_cache[sec]
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 

                for site in sec_location:
                    needing_locks[site].append( sec )
                for site in sec_destination:
                    needing_locks[site].append( sec )

                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( sec_to_distribute )>0:
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se =SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size
                            #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name,"latches on existing transfers, and nothing else"
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                print wfo.name,"should just be assigned NOW to",sites_allowed
                wfo.status = 'staged'
            passing_along+=1
            print "setting status to",wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name,"latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to",wfo.status
                    session.commit()
            print wfo.name,"needs a transfer"
            needs_transfer+=1
            passing_along+=1

    print "accumulated locks of dataset in place"
    print json.dumps(needing_locks, indent=2)
    for site,items in needing_locks.items():
        for item in items:
            LI.lock( item, SI.CE_to_SE(site), 'usable input')
        
    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"

        print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
            #for item in list(set([it.split('#')[0] for it in items_to_transfer])):
            for item in items_to_transfer:
                LI.lock( item, site_se, 'pre-staging')
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
Example #14
0
def closor(url, specific=None, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    BI = batchInfo()
    CloseI = closeoutInfo()

    all_late_files = []

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    if specific:
        wfs = [wfo for wfo in wfs if specific in wfo.name]
    wfs_n = [w.name for w in wfs]

    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)

    held = set()

    print len(wfs), "closing"
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if options.limit: max_per_round = options.limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]

        def rank(wfn):
            return all_closedout.index(wfn) if wfn in all_closedout else 0

        wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_extreme_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    closers = []

    print len(wfs), "closing"
    th_start = time.mktime(time.gmtime())

    for iwfo, wfo in enumerate(wfs):
        if specific and not specific in wfo.name: continue
        if not options.manual and (
                'cmsunified_task_HIG-RunIIFall17wmLHEGS-05036__v1_T_200712_005621_4159'
                .lower() in (wfo.name).lower() or
                'pdmvserv_task_HIG-RunIISummer16NanoAODv7-03979__v1_T_200915_013748_1986'
                .lower() in (wfo.name).lower()):
            continue
        closers.append(
            CloseBuster(
                wfo=wfo,
                url=url,
                CI=CI,
                UC=UC,
                jump_the_line=jump_the_line,
                batch_goodness=batch_goodness,
                batch_go=batch_go,
                #stats = stats,
                batch_warnings=batch_warnings,
                batch_extreme_warnings=batch_extreme_warnings,
                all_late_files=all_late_files,
                held=held,
            ))

    run_threads = ThreadHandler(threads=closers,
                                n_threads=options.threads,
                                sleepy=10,
                                timeout=None,
                                verbose=True,
                                label='closor')

    run_threads.start()

    ## waiting on all to complete
    while run_threads.is_alive():
        #print "Waiting on closing threads",time.asctime(time.gmtime())
        time.sleep(5)

    JC = JIRAClient() if up.status.get('jira', False) else None
    print len(
        run_threads.threads), "finished thread to gather information from"
    failed_threads = 0
    for to in run_threads.threads:
        if to.failed:
            failed_threads += 1
            continue
        if to.outs:
            for outO in to.outs:
                out = outO.datasetname
                odb = session.query(Output).filter(
                    Output.datasetname == out).first()
                if not odb:
                    print "adding an output object", out
                    session.add(outO)
                else:
                    odb.date = outO.date

        if to.to_status:
            to.wfo.status = to.to_status
            if JC and to.to_status == "done" and to.wfi:
                jiras = JC.find({"prepid": to.wfi.request['PrepID']})
                for jira in jiras:
                    JC.close(jira.key)

        if to.to_wm_status:
            to.wfo.wm_status = to.to_wm_status
        if to.closing:
            CloseI.pop(to.wfo.name)

        session.commit()

    th_stop = time.mktime(time.gmtime())

    if wfs:
        time_spend_per_workflow = (th_stop - th_start) / float(len(wfs))
        print "Average time spend per workflow is", time_spend_per_workflow

    if float(failed_threads / run_threads.n_threads) > 0:
        sendLog('checkor',
                '%d/%d threads have failed, better check this out' %
                (failed_threads, run_threads.n_threads),
                level='critical')
        sendEmail(
            'checkor', '%d/%d threads have failed, better check this out' %
            (failed_threads, run_threads.n_threads))

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" %
                ("\n".join(sorted(held))),
                level='critical')

    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            #if batch_warnings[ bname ]:
            #    issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
            #    issues+="\n".join( sorted( batch_warnings[ bname ] ))
            #    issues+="\n\n"
            if batch_extreme_warnings[bname]:
                subject = "Low Statistics for %s" % bname
                issues = "The following datasets have outstanding completion (<50%%) issues:\n\n"
                issues += "\n".join(sorted(batch_extreme_warnings[bname]))
                issues += "\n\n"
            elif batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = ""
            text += "Dear all,\n\n"
            text += "A batch of release validation workflows has finished.\n\n"
            text += "Batch ID:\n\n"
            text += "%s\n\n" % (bname)
            text += "Detail of the workflows\n\n"
            text += "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s\n\n" % (
                bname)
            text += "%s\n\n" % (issues)
            text += "This is an automated message.\n\n"
            text += ""
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)
            ## just announced ; take it out now.
            BI.pop(bname)
            deleteCampaignConfig(bname)

    if os.path.isfile('.closor_stop'):
        print "The loop on workflows was shortened"
        sendEmail('closor',
                  'Closor loop was shortened artificially using .closor_stop')
        os.system('rm -f .closor_stop')
Example #15
0
def checkor(url, spec=None, options=None):
    fDB = closeoutInfo()
    if userLock():
        return
    if duplicateLock():
        return

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=["mcm"])
    if not up.check():
        return
    use_mcm = up.status["mcm"]

    wfs = []
    if options.fetch:
        ## get all in running and check
        wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all())
        wfs.extend(session.query(Workflow).filter(Workflow.status == "assistance").all())
    if options.nofetch:
        ## than get all in need for assistance
        wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all())

    custodials = defaultdict(list)  # sites : dataset list
    transfers = defaultdict(list)  # sites : dataset list
    invalidations = []  # a list of files
    SI = global_SI
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split("/")[2].split("-")[0]
        except:
            if "Campaign" in wfi.request:
                campaign = wfi.request["Campaign"]
        return campaign

    by_passes = []
    holdings = []
    for bypassor, email in [
        ("jbadillo", "*****@*****.**"),
        ("vlimant", "*****@*****.**"),
        ("jen_a", "*****@*****.**"),
    ]:
        bypass_file = "/afs/cern.ch/user/%s/%s/public/ops/bypass.json" % (bypassor[0], bypassor)
        if not os.path.isfile(bypass_file):
            print "no file", bypass_file
            continue
        try:
            by_passes.extend(json.loads(open(bypass_file).read()))
        except:
            print "cannot get by-passes from", bypass_file, "for", bypassor
            sendEmail("malformated by-pass information", "%s is not json readable" % (bypass_file), destination=[email])

        holding_file = "/afs/cern.ch/user/%s/%s/public/ops/onhold.json" % (bypassor[0], bypassor)
        if not os.path.isfile(holding_file):
            print "no file", holding_file
            continue
        try:
            holdings.extend(json.loads(open(holding_file).read()))
        except:
            print "cannot get holdings from", holding_file, "for", bypassor
            sendEmail(
                "malformated by-pass information", "%s is not json readable" % (holding_file), destination=[email]
            )

    total_running_time = 5.0 * 60.0
    sleep_time = max(0.5, total_running_time / len(wfs))

    for wfo in wfs:
        if spec and not (spec in wfo.name):
            continue
        time.sleep(sleep_time)
        print "checking on", wfo.name

        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request["RequestStatus"]
        if wfo.wm_status == "closed-out":
            ## manually closed-out
            print wfo.name, "is already", wfo.wm_status
            wfo.status = "close"
            session.commit()
            continue

        elif wfo.wm_status in [
            "failed",
            "aborted",
            "aborted-archived",
            "rejected",
            "rejected-archived",
            "aborted-completed",
        ]:
            ## went into trouble
            wfo.status = "trouble"
            print wfo.name, "is in trouble", wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ["assigned", "acquired"]:
            ## not worth checking yet
            print wfo.name, "not running yet"
            session.commit()
            continue

        if "-onhold" in wfo.status:
            if wfo.name in holdings and wfo.name not in by_passes:
                print wfo.name, "on hold"
                continue

        if wfo.name in holdings and wfo.name not in by_passes:
            wfo.status = "assistance-onhold"
            print "setting", wfo.name, "on hold"
            session.commit()
            continue

        if wfo.wm_status != "completed" and not wfo.name in by_passes:
            ## for sure move on with closeout check if in completed
            print "no need to check on", wfo.name, "in status", wfo.wm_status
            session.commit()
            continue

        session.commit()
        sub_assistance = ""  # if that string is filled, there will be need for manual assistance

        is_closing = True

        ## get it from somewhere
        by_pass_checks = False
        if wfo.name in by_passes:
            print "we can bypass checks on", wfo.name
            by_pass_checks = True
        for bypass in by_passes:
            if bypass in wfo.name:
                print "we can bypass", wfo.name, "because of keyword", bypass
                by_pass_checks = True
                break

        if not CI.go(wfi.request["Campaign"]) and not by_pass_checks:
            print "No go for", wfo.name
            continue

        # tuck out DQMIO/DQM
        wfi.request["OutputDatasets"] = [out for out in wfi.request["OutputDatasets"] if not "/DQM" in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request["PrepID"], details=True)
        acdc = []
        acdc_inactive = []
        has_recovery_going = False
        had_any_recovery = False
        for member in familly:
            if member["RequestType"] != "Resubmission":
                continue
            if member["RequestName"] == wfo.name:
                continue
            if member["RequestDate"] < wfi.request["RequestDate"]:
                continue
            if member["RequestStatus"] in [
                "running-open",
                "running-closed",
                "assignment-approved",
                "assigned",
                "acquired",
            ]:
                print wfo.name, "still has an ACDC running", member["RequestName"]
                acdc.append(member["RequestName"])
                # print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False
                has_recovery_going = True
            elif member["RequestStatus"] == None:
                print member["RequestName"], "is not real"
                pass
            else:
                acdc_inactive.append(member["RequestName"])
                had_any_recovery = True
        ## completion check
        percent_completions = {}
        #        print "let's see who is crashing", wfo.name
        #        print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        if not "TotalInputEvents" in wfi.request:
            event_expected, lumi_expected = 0, 0
            if not "recovery" in wfo.status:
                sendEmail(
                    "missing member of the request",
                    "TotalInputEvents is missing from the workload of %s" % wfo.name,
                    destination=["*****@*****.**"],
                )
        else:
            event_expected, lumi_expected = wfi.request["TotalInputEvents"], wfi.request["TotalInputLumis"]

        if "RequestNumEvents" in wfi.request:
            event_expected = int(wfi.request["RequestNumEvents"])
        elif "Task1" in wfi.request and "RequestNumEvents" in wfi.request["Task1"]:
            event_expected = int(wfi.request["Task1"]["RequestNumEvents"])

        fractions_pass = {}
        for output in wfi.request["OutputDatasets"]:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.0
            if lumi_expected:
                percent_completions[output] = lumi_count / float(lumi_expected)
            if event_expected:
                percent_completions[output] = max(percent_completions[output], event_count / float(event_expected))

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and "fractionpass" in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]["fractionpass"]
                print "overriding fraction to", fractions_pass[output], "for", output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to", fractions_pass[output], "by command line for", output

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name, "is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if has_recovery_going:
                sub_assistance += "-recovering"
            elif had_any_recovery:
                ## we want to have this looked at
                sub_assistance += "-manual"
            else:
                sub_assistance += "-recovery"
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request["OutputDatasets"]:
            events_per_lumi[output] = getDatasetEventsPerLumi(output)

        lumi_upper_limit = {}
        for output in wfi.request["OutputDatasets"]:
            upper_limit = 301.0
            campaign = get_campaign(output, wfi)
            # if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and "lumisize" in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]["lumisize"]
                print "overriding the upper lumi size to", upper_limit, "for", campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to", upper_limit, "by command line"

            lumi_upper_limit[output] = upper_limit

        if any([events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name, "has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance += "-biglumi"
            is_closing = False

        any_presence = {}
        for output in wfi.request["OutputDatasets"]:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request["OutputDatasets"]:
            custodial_presences[output] = [s for s in any_presence[output] if "MSS" in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence = {}
        for output in wfi.request["OutputDatasets"]:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output)

        vetoed_custodial_tier = UC.get("tiers_with_no_custodial")
        out_worth_checking = [
            out for out in custodial_locations.keys() if out.split("/")[-1] not in vetoed_custodial_tier
        ]
        size_worth_checking = sum(
            [getDatasetSize(out) / 1023.0 for out in out_worth_checking]
        )  ## size in TBs of all outputs
        if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name, "has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]):
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:", custodial, "because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and "custodial" in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]["custodial"]
                        print "Setting custodial to", custodial, "from campaign configuration"
                        break
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the unified configuration custodial:", custodial, "because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            if not custodial and "InputDataset" in wfi.request:
                ## this is terribly dangerous to assume only
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite(wfi.request["InputDataset"])
                ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset", wfi.request[
                        "InputDataset"
                    ], "does not have custodial in the first place. abort"
                    sendEmail(
                        "dataset has no custodial location",
                        "Please take a look at %s in the logs of checkor" % wfi.request["InputDataset"],
                    )
                    is_closing = False
                    pick_custodial = False

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:", custodial, "because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for", wfo.name
                sendEmail(
                    "cannot find a custodial",
                    "cannot find a custodial for %s probably because of the total output size %d"
                    % (wfo.name, size_worth_checking),
                )

            if custodial and ((not sub_assistance and not acdc) or by_pass_checks):
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output] >= 1:
                            custodials[custodial].append(output)
                        else:
                            print "no file in phedex for", output, " not good to add to custodial requests"

            is_closing = False

        ## disk copy
        disk_copies = {}
        for output in wfi.request["OutputDatasets"]:
            disk_copies[output] = [s for s in any_presence[output] if (not "MSS" in s) and (not "Buffer" in s)]

        if not all(map(lambda sites: len(sites) != 0, disk_copies.values())):
            print wfo.name, "has not all output on disk"
            print json.dumps(disk_copies, indent=2)

        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request["OutputDatasets"]:
            dbs_presence[output] = dbs3Client.getFileCountDataset(output)
            dbs_invalid[output] = dbs3Client.getFileCountDataset(output, onlyInvalid=True)

        fraction_invalid = 0.01
        if (
            not all(
                [
                    dbs_presence[out] == (dbs_invalid[out] + phedex_presence[out])
                    for out in wfi.request["OutputDatasets"]
                ]
            )
            and not options.ignorefiles
        ):
            print wfo.name, "has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if (
            not all(
                [
                    (dbs_invalid[out] <= int(fraction_invalid * dbs_presence[out]))
                    for out in wfi.request["OutputDatasets"]
                ]
            )
            and not options.ignorefiles
        ):
            print wfo.name, "has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance += "-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            print "starting duplicate checker for", wfo.name
            for output in wfi.request["OutputDatasets"]:
                print "\tchecking", output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi(output)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi(output)
                    except:
                        print "was not possible to get the duplicate count for", output
                        is_closing = False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name, "has duplicates"
                print json.dumps(duplications, indent=2)
                ## hook for making file invalidation ?
                sub_assistance += "-duplicates"
                is_closing = False

        ## for visualization later on
        if not wfo.name in fDB.record:
            # print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {"datasets": {}, "name": wfo.name, "closeOutWorkflow": None}
        fDB.record[wfo.name]["closeOutWorkflow"] = is_closing
        for output in wfi.request["OutputDatasets"]:
            if not output in fDB.record[wfo.name]["datasets"]:
                fDB.record[wfo.name]["datasets"][output] = {}
            rec = fDB.record[wfo.name]["datasets"][output]
            rec["percentage"] = float("%.2f" % (percent_completions[output] * 100))
            rec["duplicate"] = duplications[output] if output in duplications else "N/A"
            rec["phedexReqs"] = (
                float("%.2f" % any_presence[output][custodial_presences[output][0]][1])
                if len(custodial_presences[output]) != 0
                else "N/A"
            )
            rec["closeOutDataset"] = is_closing
            rec["transPerc"] = (
                float("%.2f" % any_presence[output][disk_copies[output][0]][1])
                if len(disk_copies[output]) != 0
                else "N/A"
            )
            rec["correctLumis"] = (
                int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            )
            rec["missingSubs"] = (
                False if len(custodial_locations[output]) == 0 else ",".join(list(set(custodial_locations[output])))
            )
            rec["dbsFiles"] = dbs_presence[output]
            rec["dbsInvFiles"] = dbs_invalid[output]
            rec["phedexFiles"] = phedex_presence[output]
            rec["acdc"] = "%d / %d" % (len(acdc), len(acdc + acdc_inactive))

        if by_pass_checks:
            ## force closing
            is_closing = True

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting", wfo.name, "closed-out"
            if not options.test:
                if wfo.wm_status in ["closed-out", "announced", "normal-archived"]:
                    print wfo.name, "is already", wfo.wm_status, "not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer", res
                if not res in ["None", None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)

                if res in [None, "None"]:
                    wfo.status = "close"
                    session.commit()
                else:
                    print "could not close out", wfo.name, "will try again next time"
        else:
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            new_status = "assistance" + sub_assistance
            print wfo.name, "needs assistance with", new_status

            if sub_assistance and wfo.status != new_status and "PrepID" in wfi.request and not "manual" in wfo.status:
                pid = wfi.getPrepIDs()[0].replace("task_", "")
                # pid = wfi.request['PrepID'].replace('task_','')
                ## notify
                messages = {
                    "recovery": "Samples completed with missing statistics:\n%s "
                    % (
                        "\n".join(
                            [
                                "%.2f %% complete for %s" % (percent_completions[output] * 100, output)
                                for output in wfi.request["OutputDatasets"]
                            ]
                        )
                    ),
                    "biglumi": "Samples completed with large luminosity blocks:\n%s "
                    % (
                        "\n".join(
                            [
                                "%d > %d for %s" % (events_per_lumi[output], lumi_upper_limit[output], output)
                                for output in wfi.request["OutputDatasets"]
                                if (events_per_lumi[output] > lumi_upper_limit[output])
                            ]
                        )
                    ),
                    "duplicate": "Samples completed with duplicated luminosity blocks:\n%s"
                    % (
                        "\n".join(
                            [
                                "%s" % output
                                for output in wfi.request["OutputDatasets"]
                                if output in duplications and duplications[output]
                            ]
                        )
                    ),
                }
                text = "The request %s (%s) is facing issue in production.\n" % (pid, wfo.name)
                content = ""
                for case in messages:
                    if case in new_status:
                        content += "\n" + messages[case] + "\n"
                text += content
                text += "You are invited to check, while this is being taken care of by Ops.\n"
                text += "This is an automated message."
                if use_mcm and content:
                    print "Sending notification back to requestor"
                    print text
                    batches = mcm.getA("batches", query="contains=%s&status=announced" % pid)
                    if len(batches):
                        ## go notify the batch
                        bid = batches[-1]["prepid"]
                        print "batch nofication to", bid
                        mcm.put("/restapi/batches/notify", {"notes": text, "prepid": bid})

                    ## go notify the request
                    print "request notification to", pid
                    mcm.put("/restapi/requests/notify", {"message": text, "prepids": [pid]})

            ## case where the workflow was in manual from recoveror
            if not "manual" in wfo.status or new_status != "assistance-recovery":
                wfo.status = new_status
                if not options.test:
                    print "setting", wfo.name, "to", wfo.status
                    session.commit()
            else:
                print "current status is", wfo.status, "not changing to anything"

    fDB.html()

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ",".join(custodials[site]), "=>", site
        if not options.test:
            result = makeReplicaRequest(
                url,
                site,
                list(set(custodials[site])),
                "custodial copy at production close-out",
                custodial="y",
                priority="low",
                approve=(site in SI.sites_auto_approve),
            )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ",".join(transfers[site]), "=>", site
        if not options.test:
            result = None
            # result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Example #16
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    #if notRunningBefore( 'stagor' ): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI
    #LI = lockInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
            wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
        wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n",wfo.name,"\n\tto be assigned"
        wfh = workflowInfo( url, wfo.name)


        ## check if by configuration we gave it a GO
        if not CI.go( wfh.request['Campaign'] ) and not options.go:
            print "No go for",wfh.request['Campaign']
            n_stalled+=1
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                print wfo.name,wfh.request['RequestStatus'],"setting away and skipping"
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                n_stalled+=1
                continue

        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()

        print "Site white list",sorted(sites_allowed)

        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        c_sites_allowed = CI.get(wfh.request['Campaign'], 'SiteWhitelist' , [])
        if c_sites_allowed:
            print "Would like to use the new whitelist, but will not until things went through a bit"
            sendEmail("using a restricted site white list","for %s"%(c_sites_allowed))
            sites_allowed = list(set(sites_allowed) & set(c_sites_allowed))

        c_black_list = CI.get(wfh.request['Campaign'], 'SiteBlacklist', [])
        if c_black_list:
            print "Reducing the whitelist due to black list in campaign configuration"
            print "Removing",c_black_list
            sites_allowed = list(set(sites_allowed) - set(c_black_list))

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']

        ncores = wfh.request.get('Multicore',1)
        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] , maxCore=ncores)
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"MB and",ncores,"core are",memory_allowed
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))
        

        print "Allowed",sorted(sites_allowed)
        secondary_locations=None
        for sec in list(secondary):
            if override_sec_location: 
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        print "From secondary requirement, now Allowed",sorted(sites_allowed)

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default
        for prim in list(primary):
            set_lfn = getLFNbase( prim )
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            print "Holding the data but not allowed",list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            print "We could be running at",sorted(opportunistic_sites),"in addition"
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                print "One of the destination site is in downtime"
                down_time = True
                ## should this be send back to considered ?
                

        """
        if available_fractions and not all([available>=1. for available in available_fractions.values()]):
            print "The input dataset is not located in full over sites"
            print json.dumps(available_fractions)
            if not options.test and not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue ## skip skip skip
        """

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        less_copies_than_requested = UC.get("less_copies_than_requested")
        copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency

        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values()
            if down_time and not options.go:
                wfo.status = 'considered'
                session.commit()
                print "sending back to considered because of site downtime, instead of waiting"
                sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not sufficiently available. Probably phedex information lagging behind. \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue

        ## default back to white list to original white list with any data
        print "Allowed",sites_allowed
        if options.primary_aaa:
            sites_allowed = initial_sites_allowed
            options.useSiteListAsLocation = True
        else:
            sites_allowed = sites_with_any_data
            print "Selected for any data",sites_allowed

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name,"cannot be assign with no matched sites"
            sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name))
            n_stalled+=1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]


        print "Placing the output on", sites_out
        parameters={
            'SiteWhitelist' : sites_allowed,
            #'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }


        ## plain assignment here
        team='production'
        if options and options.team:
            team = options.team

        #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000:
        #    team = 'highprio'
        #    sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**'])

        if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]):
            ## consider SDSC
            parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC']
            parameters['useSiteListAsLocation'] = True
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**'])
            
        if wfh.request['Campaign']=='RunIIWinter15GS' and random.random() < -1.0:
            parameters['SiteWhitelist'] = ['T3_US_SDSC']
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**'])
        

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if type(v)==str and ',' in v: 
                        parameters[key] = filter(None,v.split(','))
                    else: 
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check!=True:
            parameters.update( split_check )
            if 'EventBased' in split_check.values():
                print "Falling back to event splitting."
                sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
            elif 'EventsPerJob' in split_check.values():
                print "Modifying the number of job per event"
                sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    print "There is no go for assigning that request without event splitting"
                    sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    print "need to go down to",eventsPerJob,"events per job"
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        print "need to go down to",lumisPerJob,"in assignment"
                        sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        print "the regular splitting should work for",pstring
                        sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock( secure )
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    print "Assigned",n_assigned
    print "Stalled",n_stalled
Example #17
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = global_SI
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    holdings = []
    #try:
    #    already_notified = json.loads(open('already_notifified.json').read())
    #except:
    #    print "no record of already notified workflow. starting fresh"
    #    already_notified = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        bypasses.extend( mcm_force )

    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False
        pids = wfi.getPrepIDs()
        bypass_by_mcm = False
        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
            if bypass in pids:
                wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass))
                bypass_checks = True
                bypass_by_mcm = True
                break
        
        #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks:
        #    print "No go for",wfo.name
        #    wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign'])
        #    continue


        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
            elif member['RequestStatus']==None:
                print member['RequestName'],"is not real"
                pass
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = int(wfi.request['Task1']['RequestNumEvents'])

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )
            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        print "These %d files are missing in phedex"%(len(missing_phedex))
                        print "\n".join( missing_phedex )
                    if missing_dbs:
                        print "These %d files are missing in dbs"%(len(missing_dbs))
                        print "\n".join( missing_dbs )

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and bypass_by_mcm:
                        ## shoot large on all prepids
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Example #18
0
def closor(url, specific=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')
    ## manually closed-out workflows should get to close with checkor
    if specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    held = set()

    max_per_round = UC.get('max_per_round').get('closor', None)
    random.shuffle(wfs)
    if max_per_round: wfs = wfs[:max_per_round]

    for wfo in wfs:

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog(
                'closor',
                '%s is announced already : %s' % (wfo.name, wfo.wm_status))
        session.commit()

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(
                Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()

            wfi.sendLog(
                'closor', "\t%60s %d/%d = %3.2f%%" %
                (out, lumi_count, expected_lumis,
                 lumi_count / float(expected_lumis) * 100.))
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            #fraction = wfo.fraction_for_closing
            #fraction = 0.0
            #all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites'] + wfi.request[
                    'CustodialSites']
                wfi.sendLog(
                    'closor', "%s is not in full anywhere. send to %s" %
                    (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items()
                                       if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items()
                                   if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting

        (OK, requests) = spawn_harvesting(url, wfi, in_full)
        all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and wfi.request['RequestStatus'] in [
                'closed-out'
        ]:
            print wfo.name, "to be announced"
            results = []  #'dummy']
            if not results:
                for out in outputs:
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request[
                                    'Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[
                                campaign] and tier in CI.campaigns[campaign][
                                    'toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get(
                                "tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog(
                                'closor',
                                "could not recognize %s for injecting in DDM" %
                                out,
                                level='critical')
                            continue

                        n_copies = 2
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[
                                campaign]:
                            ddm_instructions = CI.campaigns[campaign][
                                'DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items(
                                ):
                                    if ddmtier == tier or ddmtier in [
                                            '*', 'all'
                                    ]:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(
                                destinations)
                        ## inject to DDM when necessary
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            p = os.popen(
                                'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'
                                % (n_copies, out, destination_spec))
                            print p.read()
                            status = p.close()
                            if status != None:
                                print "Failed DDM, retrying a second time"
                                p = os.popen(
                                    'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'
                                    % (n_copies, out, destination_spec))
                                print p.read()
                                status = p.close()
                                if status != None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor',
                                            "could not add " + out +
                                            " to DDM pool. check closor logs.",
                                            level='critical')
                            results.append(status)
                            if status == None:
                                wfi.sendLog(
                                    'closor',
                                    '%s is send to AnalysisOps DDM pool in %s copies %s'
                                    % (n_copies, out, destination_spec))

                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                if all(
                        map(lambda result: result in ['None', None, True],
                            results)):
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None', None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        wfo.wm_status = wl_bis.request['RequestStatus']
                        session.commit()
                        if wl_bis.request['RequestStatus'] in [
                                'announced', 'normal-archived'
                        ]:
                            res = None
                        else:
                            ## retry ?
                            res = reqMgrClient.announceWorkflowCascade(
                                url, wfo.name)

                    results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True],
                       results)):
                wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow is announced")
            else:
                print "ERROR with ", wfo.name, "to be announced", json.dumps(
                    results)

        else:
            print wfo.name, "not good for announcing:", wfi.request[
                'RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))
        sendLog('closor',
                "the workflows below are held up \n%s" % ("\n".join(held)),
                level='critical')
Example #19
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        if forcings:
            sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if max_per_round and not spec: wfs = wfs[:max_per_round]

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue
            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong ibsolute
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                    "\n".join( missing_phedex )))
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Example #20
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    #if notRunningBefore( 'stagor' ): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI
    #LI = lockInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos = []
    if specific or options.early:
        wfos.extend(
            session.query(Workflow).filter(
                Workflow.status == 'considered').all())
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == 'staging').all())
    if specific:
        wfos.extend(
            session.query(Workflow).filter(
                Workflow.status == 'considered-tried').all())
    wfos.extend(
        session.query(Workflow).filter(Workflow.status == 'staged').all())
    #if specific:
    #    #wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    #    wfos = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    #if not wfos:
    #    if specific:
    #        wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
    #        wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
    #    wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    random.shuffle(wfos)
    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)
        wfh.sendLog('assignor', "%s to be assigned" % wfo.name)

        ## check if by configuration we gave it a GO
        if not CI.go(wfh.request['Campaign']) and not options.go:
            wfh.sendLog('assignor', "No go for %s" % wfh.request['Campaign'])
            n_stalled += 1
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(
                        dataset, runs=wfh.request['RunWhitelist'])))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None
        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check",
                          "but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default
        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite in SI.sites_not_ready
                        for osite in opportunistic_sites
                    ]))
                down_time = True
                ## should this be send back to considered ?
        """
        if available_fractions and not all([available>=1. for available in available_fractions.values()]):
            print "The input dataset is not located in full over sites"
            print json.dumps(available_fractions)
            if not options.test and not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue ## skip skip skip
        """

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                sendEmail(
                    "cannot be assigned due to downtime",
                    "%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."
                    % wfo.name)
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early:
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    sendEmail(
                        "cannot be assigned",
                        "%s is not sufficiently available.\n %s" %
                        (wfo.name, json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))
                n_stalled += 1
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                continue

        ## default back to white list to original white list with any data
        print "Allowed", sites_allowed
        if options.primary_aaa:
            sites_allowed = initial_sites_allowed
            #options.useSiteListAsLocation = True
            options.TrustSitelists = True
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        if options.restrict:
            print "Allowed", sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected", sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for", list(
                    set(sites_allowed) - set(sites_with_data)), "?"
                print "Whitelist site with any data", list(
                    set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            wfh.sendLog('assignor', "cannot be assign with no matched sites")
            sendEmail("cannot be assigned", "%s has no whitelist" % (wfo.name))
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist
        ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            #'CustodialSites' : sites_custodial,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team

        ## high priority team agent
        #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000:
        #    team = 'highprio'
        #    sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**'])

        ## SDSC redirection
        #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]):
        #    ## consider SDSC
        #    parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC']
        #    parameters['useSiteListAsLocation'] = True
        #    team = 'allocation-based'
        #    sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**'])

        ## SDSC redirection
        #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0:
        #    parameters['SiteWhitelist'] = ['T3_US_SDSC']
        #    team = 'allocation-based'
        #    sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**'])

        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT",
                      "%s was assigned to HLT" % wfo.name)

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if type(v) == str and ',' in v:
                        parameters[key] = filter(None, v.split(','))
                    else:
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check != True:
            parameters.update(split_check)
            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                sendEmail(
                    "Fallback to EventBased",
                    "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"
                    % wfo.name)
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor',
                            "Modifying the number of job per event")
                sendEmail(
                    "Modifying the job per events",
                    "the workflow %s is too heavy in number of jobs explosion"
                    % wfo.name)

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    sendEmail(
                        "issue with event splitting for run-dependent MC",
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        sendEmail(
                            "setting lumi splitting for run-dependent MC",
                            "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        sendEmail("leaving splitting untouched for PU_RD*",
                                  "please check on " + wfo.name)
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock(secure)
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
Example #21
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos = []
    if specific or options.early:
        wfos.extend(
            session.query(Workflow).filter(
                Workflow.status == 'considered').all())
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == 'staging').all())
    if specific:
        wfos.extend(
            session.query(Workflow).filter(
                Workflow.status == 'considered-tried').all())
    wfos.extend(
        session.query(Workflow).filter(Workflow.status == 'staged').all())

    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')
    random.shuffle(wfos)
    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)
        wfh.sendLog('assignor', "%s to be assigned" % wfo.name)

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            n_stalled += 1
            no_go = True

        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
        if (secondary and allowed_secondary) and (
                set(secondary) & allowed_secondary != set(secondary)):
            wfh.sendLog(
                'assignor', '%s is not an allowed secondary' %
                (', '.join(set(secondary) - allowed_secondary)))
            #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary)))
            sendLog('assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) - allowed_secondary)),
                    level='critical')
            if not options.go:
                n_stalled += 1
                no_go = True

        if no_go:
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(
                        dataset, runs=wfh.request['RunWhitelist'])))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'primary_AAA' in CI.campaigns[
                    wfh.request['Campaign']]:
            primary_aaa = primary_aaa or CI.campaigns[
                wfh.request['Campaign']]['primary_AAA']
        secondary_aaa = options.secondary_aaa
        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'secondary_AAA' in CI.campaigns[
                    wfh.request['Campaign']]:
            secondary_aaa = secondary_aaa or CI.campaigns[
                wfh.request['Campaign']]['secondary_AAA']

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check",
                          "but we cannot yet IMO")
                #pass
            if secondary_aaa:
                #just continue without checking
                continue

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite in SI.sites_not_ready
                        for osite in opportunistic_sites
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial:
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    sendEmail(
                        "cannot be assigned",
                        "%s is not sufficiently available.\n %s" %
                        (wfo.name, json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))
                n_stalled += 1
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if options.partial:
                    print "Will move on with partial locations"
                else:
                    continue

        ## default back to white list to original white list with any data
        print "Allowed", sorted(sites_allowed)

        if primary_aaa:
            sites_allowed = initial_sites_allowed
            options.TrustSitelists = True
            wfh.sendLog(
                'assignor', "Selected to read primary through xrootd %s" %
                sorted(sites_allowed))
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        if secondary_aaa:
            options.TrustPUSitelists = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd from %s" %
                sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if endpoints and options.partial:
            sites_allowed = list(
                set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints]))
            print "with added endpoints", sorted(sites_allowed)

        if not len(sites_allowed):
            wfh.sendLog('assignor', "cannot be assign with no matched sites")
            sendLog('assignor',
                    '%s has no whitelist' % wfo.name,
                    level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team

        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT",
                      "%s was assigned to HLT" % wfo.name)

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if type(v) == str and ',' in v:
                        parameters[key] = filter(None, v.split(','))
                    else:
                        parameters[key] = v

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check != True:
            parameters.update(split_check)
            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog(
                    'assignor',
                    'the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting'
                    % wfo.name,
                    level='critical')
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor',
                            "Modifying the number of job per event")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog(
                    'assignor',
                    "the workflow %s is too heavy in number of jobs explosion"
                    % wfo.name,
                    level='critical')

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock(secure)
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
Example #22
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock() and not options.go:  return

    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    def time_point(label="",sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s"%(label, nows)
        print "Since start: %s [s]"% ( now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) 
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]"% ( now - time_point.lap ) 
            time_point.lap = now            
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())
    
    runnings = session.query(Workflow).filter(Workflow.status == 'away').all()
    standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()

    ## intersect with what is actually in completed status in request manager now
    all_completed = set(getWorkflows(url, 'completed' ))

    wfs=[]

    if options.strict:
        ## the one which were running and now have completed
        print "strict option is on: checking workflows that freshly completed"
        wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings))
    if options.update:
        print "update option is on: checking workflows that have not completed yet"
        wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings))

    if options.clear:
        print "clear option is on: checking workflows that are ready to toggle closed-out"
        wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings))
    if options.review:
        print "review option is on: checking the workflows that needed intervention"
        wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings))

    ## what is left out are the wf which were running and ended up aborted/failed/...

    

    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False) if use_mcm else None

    def get_campaign(output, wfi):
        ## this should be a perfect matching of output->task->campaign
        campaign = None
        era = None
        wf_campaign = None
        if 'Campaign' in wfi.request:   wf_campaign = wfi.request['Campaign']
        try:
            era = output.split('/')[2].split('-')[0]
        except:
            era = None
            
        if wfi.isRelval(): 
            campaign = wf_campaign
        else:
            campaign = era if era else wf_campaign
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    
    actors = UC.get('allowed_bypass')

    for bypassor,email in actors:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            extending = json.loads(open(holding_file).read())
            print bypassor,"is holding",extending
            holdings.extend( extending )
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in actors:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        #if forcings:
        #    sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    in_manual = 0

    ## now you have a record of what file was invalidated globally from TT
    TMDB_invalid = dataCache.get('file_invalidation') 
    #try:
    #    TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))])
    #    TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid)
    #    print len(TMDB_invalid),"globally invalidated files"
    #except Exception as e:
    #    print "TMDB not fetched"
    #    print str(e)
    #    TMDB_invalid = []


    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if options.limit: max_per_round=options.limit
    if max_per_round and not spec: wfs = wfs[:max_per_round]



    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        
        time.sleep( sleep_time )
        
        time_point("Starting with %s"% wfo.name)

        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM'))
        campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c 
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        true_familly = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['PrepID'] != wfi.request['PrepID'] : continue
            #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue

            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical')
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue

            true_familly.append( member['RequestName'] )
            #try:
            #    parse_one(url, member['RequestName'])
            #except:
            #    print "Could not make error report for",member['RequestName']

            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))
            sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical')

        time_point("checked workflow familly", sub_lap=True)


        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong ibsolute
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        events_per_lumi = {}

        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        time_point("execpted statistics", sub_lap=True)

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100
                
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            default_pass = UC.get('default_fraction_pass')
            fractions_pass[output] = default_pass
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                if type(CI.campaigns[c]['fractionpass']) == dict:
                    tier = output.split('/')[-1]
                    priority = str(wfi.request['RequestPriority'])
                    ## defined per tier
                    fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass)
                    if tier in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier]
                    if priority in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority]
                else:
                    fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical')
                #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**'])
                ## do not bypass for now, until Alan understands why we are loosing ACDC docs 
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        time_point("checked output size", sub_lap=True)

        ## correct lumi < 300 event per lumi
        #for output in wfi.request['OutputDatasets']:
        #events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        time_point("checked dataset presence", sub_lap=True)

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        time_point("checked custodiality", sub_lap=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        time_point("checked phedex count", sub_lap=True)


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            group = None
            if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]:
                group = CI.campaigns[campaign]['phedex_group']
                print "using group",group,"for replica"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit")
                
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)


            if custodial and size_worht_going_to_ddm > tape_size_limit:
                print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit))
                custodial = None

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"

                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output))
                            custodials[custodial].append( output )
                            if group: custodials[custodial][-1]+='@%s'%group
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        time_point("determined tape location", sub_lap=True)

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        
        time_point("dbs file count", sub_lap=True)

        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n"
            mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n"
            mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n"
            mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n"

            wfi.sendLog('checkor',mismatch_notice)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                                                                          "\n".join( missing_phedex )))
                        were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
                            sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated),
                                                                                                                          "\n".join(were_invalidated)), level='critical')
                            dbs3Client.setFileStatus( were_invalidated, newstatus=0 )
                                
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))
                        were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False
        
        time_point("checked file count", sub_lap=True)

        fraction_invalid = 0.20
        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        files_per_rl = {}
        for output in wfi.request['OutputDatasets']:
            duplications[output] = "skiped"
            files_per_rl[output] = "skiped"

        time_point("checked invalidation", sub_lap=True)

        if (is_closing or bypass_checks) and (not options.ignoreduplicates):
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                    except Exception as e:
                        wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output))
                        sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical')
                        is_closing=False

            if is_closing and any(duplications.values()) and not options.ignoreduplicates:
                duplicate_notice = ""
                duplicate_notice += "%s has duplicates\n"%wfo.name
                duplicate_notice += json.dumps( duplications,indent=2)
                duplicate_notice += '\n'
                duplicate_notice += json.dumps( files_per_rl, indent=2)
                wfi.sendLog('checkor',duplicate_notice)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 


        time_point("checked duplicates", sub_lap=True)

        time_point("done with %s"%wfo.name)

        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            #rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            rec['familly'] = true_familly
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## make the lumi summary 
        if wfi.request['RequestType'] == 'ReReco':
            try:
                os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID']))
                os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID']))
                wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID']))
            except Exception as e:
                print str(e)
        ## make the error report
        
    
        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            wfi.sendLog('checkor',"setting %s closed-out"% wfo.name)
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            if not 'custodial' in assistance_tags or wfi.isRelval():
                ## do only the report for those
                for member in acdc+acdc_inactive+[wfo.name]:
                    try:
                        parse_one(url, member)
                    except:
                        print "Could not make error report for",member

            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that had ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')
                in_manual += 1
            if 'recovery' in assistance_tags and 'manual' in assistance_tags:
                ## this is likely because something bad is happening, so leave it to manual
                assistance_tags = assistance_tags - set(['recovery'])
                assistance_tags.add('manual')
                in_manual += 1

            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name)
                ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                perflink = '%s/report/%s'%(unified_url,wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status))
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec and in_manual!=0:
        sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        items_at = defaultdict(set)
        for i in custodials[site]:
            item, group = i.split('@') if '@' in i else (i,'DataOps')
            items_at[group].add( item )
        for group,items in items_at.items():
            print ','.join(items),'=>',site,'@',group
            if not options.test:
                result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group)
                print result

    print "File Invalidation"
    print invalidations
Example #23
0
def closor(url, specific=None, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return


    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')
    ## manually closed-out workflows should get to close with checkor
    if specific:
        wfs = session.query(Workflow).filter(Workflow.status=='close').filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status=='close').all()

    held = set()

    print len(wfs),"closing"
    max_per_round = UC.get('max_per_round').get('closor',None)
    if options.limit: max_per_round = options.limit
    random.shuffle( wfs )    
    if max_per_round: wfs = wfs[:max_per_round]

    for wfo in wfs:

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis 
        wfi = workflowInfo(url, wfo.name )
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.request['RequestStatus'] in  ['announced','normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status))
        session.commit()


        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis']==0:
            print wfo.name,"is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda : False)
        #print outputs
        if len(outputs): 
            print wfo.name,wfi.request['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()

            wfi.sendLog('closor',"\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.))
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            #fraction = wfo.fraction_for_closing
            #fraction = 0.0
            #all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
            all_OK[out] = True 


        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence( url, out )
            where = [site for site,info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out,"is in full at",",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
                wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
                at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
                else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
                print json.dumps( at_destination )
                print json.dumps( else_where, indent=2 )
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to = there )
                    for l in late_info:
                        l.update({"workflow":wfo.name,"dataset":out})
                    all_late_files.extend( late_info )
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

    
        ## verify if we have to do harvesting

        if not options.no_harvest:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update( OK )

        ## only that status can let me go into announced
        if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']:
            print wfo.name,"to be announced"
            results=[]#'dummy']
            if not results:
                for out in outputs:
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"):
                            print "tier",tier,"neither TO or NO DDM for",out
                            results.append('Not recognitized tier %s'%tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical')
                            continue

                        n_copies = 2
                        destinations=[]
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier,indication in ddm_instructions.items():
                                    if ddmtier==tier or ddmtier in ['*','all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']
                                            
                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination="+",".join( destinations )
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending",out," to DDM"
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --debug 0 --exec'%(n_copies, out,destination_spec))
                            ddm_text = p.read()
                            print ddm_text
                            status = p.close()
                            if status!=None:
                                print "Failed DDM, retrying to send",out,"a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --debug 1 --exec'%(n_copies, out,destination_spec))

                                ddm_text = p.read()
                                print ddm_text
                                status = p.close()    
                                if status!=None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical')
                            results.append( status )
                            if status == None:
                                wfi.sendLog('closor',ddm_text)
                                wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec))
                                                            
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                if all(map(lambda result : result in ['None',None,True],results)):
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None',None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        wfo.wm_status = wl_bis.request['RequestStatus']
                        session.commit()
                        if wl_bis.request['RequestStatus'] in  ['announced','normal-archived']:
                            res = None
                        else:
                            ## retry ?
                            res = reqMgrClient.announceWorkflowCascade(url, wfo.name) 
                            
                    results.append( res )
                                
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor',"workflow is announced")
            else:
                print "ERROR with ",wfo.name,"to be announced",json.dumps( results )
                
        else:
            print wfo.name,"not good for announcing:",wfi.request['RequestStatus']
            wfi.sendLog('closor',"cannot be announced")
            held.add( wfo.name )

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries']>=retries_late]
    really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) )
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor',subject)
        print subject
        open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2))

    if held:
        #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))
        sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical')
Example #24
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transfered = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer"
    else:
        print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(
            Workflow.status.startswith('considered')).all():
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    ignored_input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read())

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ds_s = dss.get(prim)
            if prim in stucks:
                sendLog('transferor',
                        "%s appears stuck, so not counting it %s [GB]" %
                        (prim, ds_s),
                        wfi=wfh)
                ignored_input_sizes[prim] = ds_s
            else:
                input_sizes[prim] = ds_s
                sendLog('transferor',
                        "%s needs %s [GB]" % (wfo.name, ds_s),
                        wfi=wfh)
        if in_transfer_priority == None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority == None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    if min_transfer_priority == None or in_transfer_priority == None:
        print "nothing is lining up for transfer"
        sendEmail("no request in staging", "no request in staging")
        return
        pass

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim)
            input_sizes[prim] = prim_size
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size(i, j):
        if int(i[1].request['RequestPriority']) == int(
                j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)),
                       int(primary_input_per_workflow_gb.get(i[0].name, 0)))
        else:
            return cmp(int(i[1].request['RequestPriority']),
                       int(j[1].request['RequestPriority']))

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = {}
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    max_per_round = UC.get('max_per_round').get('transferor', None)
    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                wfo.status = 'trouble'  ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (_, primary, _, _) = wfh.getIO()
        this_load = sum([input_sizes[prim] for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "%s minimum priority %s < %s : stop" %
                            (min_transfer_priority,
                             wfh.request['RequestPriority'],
                             in_transfer_priority))
                        no_budget = True

        ## throtlle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
        if secondary:
            if (secondary and allowed_secondary) and (
                    set(secondary) & allowed_secondary != set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) - allowed_secondary)))
                no_go = True

        if no_go:
            continue
        ## check if the batch is announced

        def check_mcm(wfn):
            announced = False
            is_real = False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches', query='contains=%s' % wfo.name):
                    is_real = True
                    if b['status'] == 'announced':
                        announced = True
                        break
            except:
                try:
                    for b in mcm.getA('batches',
                                      query='contains=%s' % wfo.name):
                        is_real = True
                        if b['status'] == 'announced':
                            announced = True
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced, is_real

        if not use_mcm:
            announced, is_real = False, True
        else:
            if wfh.request['RequestType'] in ['ReReco']:
                announced, is_real = True, True
            else:
                announced, is_real = check_mcm(wfo.name)

        if not announced:
            wfh.sendLog('transferor', "does not look announced.")

        if not is_real:
            wfh.sendLog('transferor', "does not appear to be genuine.")

            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(
            time.strptime('.'.join(map(str, wfh.request['RequestDate'])),
                          "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
        now = time.mktime(time.gmtime()) / (60. * 60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced:
                wfh.sendLog(
                    'transferor',
                    "It is too soon to start transfer: %3.2fH remaining" %
                    (now - injection_time))
                continue

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue

        ## the site white list considers site, campaign, memory and core information
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            ## lock everything flat
            NLI.lock(dataset)

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(
                        dataset, runs=wfh.request['RunWhitelist'])))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list
            blocks = list(
                set(blocks +
                    getDatasetBlocks(dataset, lumis=wfh.request['LumiList'])))

        if blocks:
            print "Reading", len(blocks), "in block whitelist"

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chope the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s  from cpu requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))

                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                prim_destination = [
                    site for site in destinations.keys()
                    if not site in prim_location
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "not counting existing copies ; now need %s" %
                    copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]

                latching_on_transfers = set()
                [
                    latching_on_transfers.update(info['blocks'].values())
                    for site, info in destinations.items()
                    if site in prim_destination
                ]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## means there is openings let me go
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                    #for site in sites_allowed:
                    #    #increment accross the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s go-on over transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(
                        Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(
                            Transfer.phedexid == -int(latching)).first()

                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching  ## make it positive ever

                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush(
                        )  ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog(
                    'transferor',
                    "Not counting the copies being made ; then need %s" %
                    copies_needed)
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The output is either fully in place or getting in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies, but no destinations seems available"
                    )
                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute if not any([
                            osite.startswith(site)
                            for osite in SI.sites_veto_transfer
                        ])
                    ]

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        ### hard include the tape disk andpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s cannot seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = input_sizes[
                            prim]  ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']

            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if site in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                sec_to_distribute = [
                    site for site in sites_allowed if
                    not any([osite.startswith(site) for osite in sec_location])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any(
                        [osite.startswith(site) for osite in sec_destination])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] * 1024.) > sec_size:
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[
                                site_se] * 1024, "GB need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor',
                                        '%s is too big (%s) for %s (%s)' %
                                        (sec, sec_size, site_se,
                                         SI.disk[site_se] * 1024),
                                        level='critical')
                else:
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else, settin staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor', "setting status to %s" % wfo.status)
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor',
                                "setting status to %s" % wfo.status)
                    session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(no_goes),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue

        if execute:
            priority = 'normal'
            cds = [
                ds for ds in datasets + block_datasets if ds in max_priority
            ]
            if cds and False:  ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too large though
                if any([max_priority[ds] >= 90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds] < 80000 for ds in cds]):
                    priority = 'low'

            result = makeReplicaRequest(url,
                                        site_se,
                                        items_to_transfer,
                                        'prestaging',
                                        priority=priority)
        else:
            result = {'phedex': {'request_created': []}}
            fake_id -= 1

        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(
                Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(
                    Transfer.phedexid == -int(phedexid)).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            else:
                new_transfer.phedexid = phedexid  ## make it positive again

            new_transfer.workflows_id = set()
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update(
                    workflow_dependencies[transfering])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        session.commit()
Example #25
0
def transferor(url ,specific = None, talk=True, options=None):
    if userLock('transferor'):   return

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all())
    max_to_handle = options.maxworkflows
    allowed_to_handle = max(0,max_to_handle - being_handled)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"
    print "... done"

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    input_sizes = {}
    ## list the size of those in transfer already
    in_transfer_priority=0
    min_transfer_priority=100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        (_,primary,_,_) = wfh.getIO()
        for prim in primary: 
            input_sizes[prim] = dss.get( prim )
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))
    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    in_transfer_already = sum(input_sizes.values())


    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)


    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get( prim )
    print "... done"

    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer 
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    for (wfo,wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name,"to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load"%this_load
            print "%15.4f GB already this round"%sum(transfer_sizes.values())
            print "%15.4f GB is the available limit"%transfer_limit
            went_over_budget=True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget"
            else:
                if not options.go: 
                    print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop"
                    continue


        ## throtlle by campaign go
        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            if not options.go: continue

        ## check if the batch is announced
        announced=False
        is_real=False
        for b in mcm.getA('batches',query='contains=%s'% wfo.name):
            is_real = True
            if b['status']=='announced': 
                announced=True 
                break

        if not announced:
            print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?"
            
        if not is_real:
            print wfo.name,"does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)
                continue


        passing_along += 1
        if passing_along >= allowed_to_handle:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle
            else:
                print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along
                break

        (lheinput,primary,parent,secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']

        can_go = True
        staging=False
        if primary:
            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chope the primary dataset 
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))
                sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed minus the vetoed transfer"
                print sites_really_allowed
                copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number based if the white list grows that big
                print "Would make",copies_needed,"copies"
                if options.maxcopy>0:
                    copies_needed = min(options.maxcopy,copies_needed)

                ## remove the sites that do not want transfers                
                print "need",copies_needed
                workflow_dependencies[prim].add( wfo.id )
                presence = getDatasetPresence( url, prim )
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                subscriptions = listSubscriptions( url , prim )
                prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers =  list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                print latching_on_transfers
                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                            
                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                copies_needed = max(0,copies_needed - len(prim_destination))
                print "then need",copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with",latching_on_transfers
                    can_go = True
                    continue
                prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])]
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: spreading[site]=[prim]
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )




        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )
                presence = getDatasetPresence( url, sec )
                sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions( url ,sec )
                sec_destination = [site for site in subscriptions] 
                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( sec_to_distribute )>0:
                    for site in sec_to_distribute:
                        all_transfers[site].append( sec )
                        can_go = False
        
        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name,"latches on existing transfers, and nothing else"
                wfo.status = 'staging'
            else:
                print wfo.name,"should just be assigned NOW to",sites_allowed
                wfo.status = 'staged'
            print "setting status to",wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name,"latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to",wfo.status
                    session.commit()
            print wfo.name,"needs a transfer"
            needs_transfer+=1

    #print json.dumps(all_transfers)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"

        print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
Example #26
0
def recoveror(url, specific, options=None):
    if userLock('recoveror'): return

    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    use_recoveror = UC.get('use_recoveror')

    if not use_recoveror and not options.go:
        print "We are told not to run recoveror"
        return

    def make_int_keys(d):
        for code in d:
            d[int(code)] = d.pop(code)

    error_codes_to_recover = UC.get('error_codes_to_recover')
    error_codes_to_block = UC.get('error_codes_to_block')
    error_codes_to_notify = UC.get('error_codes_to_notify')
    make_int_keys(error_codes_to_recover)
    make_int_keys(error_codes_to_block)
    make_int_keys(error_codes_to_notify)

    #wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all()
    wfs = session.query(Workflow).filter(
        Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend(
            session.query(Workflow).filter(
                Workflow.status == 'assistance-manual').all())

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        if not specific and 'manual' in wfo.status: continue

        wfi = workflowInfo(url, wfo.name)

        ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

        all_errors = {}
        try:
            ## this is clearly very truncated and should be changed completely
            wfi.getSummary()
            all_errors = wfi.summary['errors']
        except:
            pass

        print '-' * 100
        print "Looking at", wfo.name, "for recovery options"

        recover = True

        if not 'MergedLFNBase' in wfi.request:
            print "f****d up"
            sendEmail('missing lfn', '%s wl cache is screwed up' % wfo.name)
            recover = False

        if not len(all_errors):
            print "\tno error for", wfo.name
            recover = False

        task_to_recover = defaultdict(list)
        message_to_ops = ""
        message_to_user = ""

        if 'LheInputFilese' in wfi.request and wfi.request['LheInputFiles']:
            ## we do not try to recover pLHE
            recover = False

        if wfi.request['RequestType'] in ['MonteCarlo', 'ReReco']:
            recover = False

        if 'Campaign' in wfi.request:
            c = wfi.request['Campaign']
            if c in CI.campaigns and 'recover' in CI.campaigns[c]:
                recover = CI.campaigns[c]['recover']

        for task, errors in all_errors.items():
            print "\tTask", task
            ## collect all error codes and #jobs regardless of step at which it occured
            all_codes = []
            for name, codes in errors.items():
                if type(codes) == int: continue
                all_codes.extend([
                    (int(code), info['jobs'], name,
                     list(set([e['type'] for e in info['errors']])),
                     list(set([e['details'] for e in info['errors']])))
                    for code, info in codes.items()
                ])

            all_codes.sort(key=lambda i: i[1], reverse=True)
            sum_failed = sum([l[1] for l in all_codes])

            for errorCode, njobs, name, types, details in all_codes:
                rate = 100 * njobs / float(sum_failed)
                #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name)
                print(
                    "\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s"
                ) % (njobs, "%4.2f" % rate, errorCode, ','.join(types), name)

                added_in_recover = False

                #if options.go:
                # force the recovery of any task with error ?

                if errorCode in error_codes_to_recover:
                    ## the error code is registered
                    for case in error_codes_to_recover[errorCode]:
                        match = case['details']
                        matched = (match == None)
                        if not matched:
                            matched = False
                            for detail in details:
                                if match in detail:
                                    print "[recover] Could find keyword", match, "in"
                                    print 50 * "#"
                                    print detail
                                    print 50 * "#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => we should be able to recover that", case[
                                'legend']
                            task_to_recover[task].append((code, case))
                            added_in_recover = True
                            message_to_user = ""
                        else:
                            print "\t\t recoverable but not frequent enough, needs", case[
                                'rate']

                if errorCode in error_codes_to_block:
                    for case in error_codes_to_block[errorCode]:
                        match = case['details']
                        matched = (match == None)
                        if not matched:
                            matched = False
                            for detail in details:
                                if match in detail:
                                    print "[block] Could find keyword", match, "in"
                                    print 50 * "#"
                                    print detail
                                    print 50 * "#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => that error means no ACDC on that workflow", case[
                                'legend']
                            if not options.go:
                                message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n " % (
                                    wfo.name, errorCode, '#' * 50)
                                recover = False
                                added_in_recover = False

                if errorCode in error_codes_to_notify and not added_in_recover:
                    print "\t\t => we should notify people on this"
                    message_to_user += "%s has an error %s in processing.\n%s\n" % (
                        wfo.name, errorCode, '#' * 50)

        if message_to_user:
            print wfo.name, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
            sendLog('recoveror', message_to_ops, level='warning')

        if len(task_to_recover) != len(all_errors):
            print "Should not be doing partial ACDC. skipping"
            #sendEmail('recoveror','do not want to make partial acdc on %s'%wfo.name)
            sendLog('recoveror',
                    'do not want to make partial acdc on %s' % wfo.name,
                    level='warning')
            recover = False

        if task_to_recover and recover:
            print "Initiating recovery"
            print ', '.join(task_to_recover.keys()), "to be recovered"

            recovering = set()
            for task in task_to_recover:
                print "Will be making a recovery workflow for", task

                ## from here you can fetch known solutions, to known error codes
                actions = list(
                    set([
                        case['solution']
                        for code, case in task_to_recover[task]
                    ]))
                acdc = singleRecovery(url,
                                      task,
                                      wfi.request,
                                      actions,
                                      do=options.do)

                if not acdc:
                    if options.do:
                        if recovering:
                            print wfo.name, "has been partially ACDCed. Needs manual attention"
                            #sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**'])
                            sendLog('recoveror',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfo.name, len(recovering),
                                     len(task_to_recover), list(recovering)),
                                    level='critical')
                            continue
                        else:
                            print wfo.name, "failed recovery once"
                            #break
                            continue
                    else:
                        print "no action to take further"
                        sendLog('recoveror',
                                "ACDC for %s can be done automatically" %
                                wfo.name,
                                level='critical')
                        continue

                ## and assign it ?
                team = wfi.request['Team']
                #assign_to_sites = set(SI.sites_ready) ## that needs to be massaged to prevent assigning to something out.
                assign_to_sites = set(SI.all_sites)
                parameters = {
                    #'SiteWhitelist' : wfi.request['SiteWhitelist'],
                    'SiteWhitelist': sorted(assign_to_sites),
                    'AcquisitionEra': wfi.acquisitionEra(),
                    'ProcessingString': wfi.processingString(),
                    'MergedLFNBase': wfi.request['MergedLFNBase'],
                    'ProcessingVersion': wfi.request['ProcessingVersion'],
                }
                ## hackery for ACDC merge assignment
                if wfi.request[
                        'RequestType'] == 'TaskChain' and 'Merge' in task.split(
                            '/')[-1]:
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary
                if 'TrustSitelists' in wfi.request and wfi.request[
                        'TrustSitelists']:
                    parameters['TrustSitelists'] = True
                if 'TrustPUSitelists' in wfi.request and wfi.request[
                        'TrustPUSitelists']:
                    parameters['TrustPUSitelists'] = True

                if options.ass:
                    print "really doing the assignment of the ACDC", acdc
                    parameters['execute'] = True
                    wfi.sendLog('recoveror',
                                "%s  was assigned for recovery" % acdc)
                else:
                    print "no assignment done with this ACDC", acdc
                    sendLog('recoveror',
                            "%s needs to be assigned" % (acdc),
                            level='critical')

                result = reqMgrClient.assignWorkflow(url, acdc, team,
                                                     parameters)
                if not result:
                    print acdc, "was not asigned"
                    sendLog('recoveror',
                            "%s needs to be assigned" % (acdc),
                            level='critical')
                else:
                    recovering.add(acdc)

            current = None
            if recovering:
                #if all went well, set the status to -recovering
                current = wfo.status
                if options.ass:
                    current = current.replace('recovery', 'recovering')
                else:
                    current = 'assistance-manual'
                print 'created ACDC: ' + ', '.join(recovering)
            else:
                ## was set to be recovered, and no acdc was made
                current = 'assistance-manual'

            if current:
                print wfo.name, "setting the status to", current
                wfo.status = current
                session.commit()
        else:
            ## this workflow should be handled manually at that point
            print wfo.name, "needs manual intervention"
            wfo.status = 'assistance-manual'
            session.commit()
Example #27
0
def new_recoveror(url, specific, options=None):
    if userLock('recoveror'): return

    up = componentInfo(soft=['mcm','wtc','jira'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = session.query(Workflow).filter(Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() )    

    try:
        from_operator = json.loads(os.popen('curl -s http://vocms0113.cern.ch/actions/test.json').read())
        ## now we have a list of things that we can take action on
    except:
        pass




    for wfo in wfs:
        if specific and not specific in wfo.name:continue

        if not specific and 'manual' in wfo.status: continue
        
        wfi = workflowInfo(url, wfo.name)
    
        send_recovery = False ## will make all acdc
        send_clone = False ## will make a clone
        send_back = False ## should just reject. manual ?
        send_manual = False ## will set in manual

        where_to_run, missing_to_run = wfi.getRecoveryInfo()

        task_to_recover = where_to_run.keys()

        ## if the site at which the recovery could run in drain or out ?
        for task in task_to_recover:
            not_ready = set(where_to_run[task]) - set(SI.sites_ready)
            if not_ready:
                print "the following sites are not ready for the ACDC",",".join( sorted(not_ready) )
                ## do we have a way of telling if a site is going to be out for a long time ?
                # check on priority: high prio, restart
                if wfi.request['RequestPriority'] >= 85000:
                    send_clone = True
                # check on age of the request
                injection_time = time.mktime(time.strptime('.'.join(map(str,wfi.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
                now = time.mktime(time.gmtime()) / (60.*60.)
                if float(now - injection_time) <14.:
                    ## less than 14 days, start over
                    send_clone = True
                else:
                    send_manual = True

        
        if not send_recovery:
            ## check on whether the stats is very low
            pass

        if send_recovery:
            ## make acdc for all tasks
            for task in task_to_recover:
                actions = list(set([case['solution'] for code,case in task_to_recover[task]  ]))
                acdc = singleRecovery(url, task, wfi.request , actions, do = True)
        elif send_clone:
            ## this will get it cloned
            wfo.status = 'assistance-clone'
            session.commit()
        elif send_manual:
            wfo.status = 'assistance-manual'
Example #28
0
def transferor(url ,specific = None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    ignored_input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=None
    min_transfer_priority=None
    print "getting all wf in staging ..."
    stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read())
    
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            ds_s = dss.get( prim )
            if prim in stucks: 
                sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh)
                ignored_input_sizes[prim] = ds_s
            else:
                input_sizes[prim] = ds_s
                sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh)
        if in_transfer_priority==None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        if min_transfer_priority==None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))

    if min_transfer_priority==None or in_transfer_priority ==None:
        print "nothing is lining up for transfer"
        sendEmail("no request in staging","no request in staging")
        return 
        pass

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, ignored_values ) )
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, considered_values) )
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get( prim )
            input_sizes[prim] = prim_size
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle( wfs_and_wfh )
    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size( i, j):
        if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) )
        else:
            return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']))

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB
    
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    no_goes = set()

    max_per_round = UC.get('max_per_round').get('transferor',None)
    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]
    
    for (wfo,wfh) in wfs_and_wfh:
        print wfo.name,"to be transfered with priority",wfh.request['RequestPriority']

        if wfh.request['RequestStatus']!='assignment-approved':
            if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']:
                wfo.status = 'trouble' ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status))
            continue

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        no_budget = False
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go: 
                        wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority))
                        no_budget = True

        ## throtlle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add( wfo.name )
            
        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
        if secondary:
            if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)):
                wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)))
                no_go = True

        if no_go:
            continue
        ## check if the batch is announced

        def check_mcm(wfn):
            announced=False
            is_real=False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                    is_real = True
                    if b['status']=='announced': 
                        announced=True 
                        break
            except:
                try:
                    for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                        is_real = True
                        if b['status']=='announced': 
                            announced=True 
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced,is_real

        if not use_mcm:
            announced,is_real = False,True
        else:
            if wfh.request['RequestType'] in ['ReReco']:
                announced,is_real = True,True
            else:
                announced,is_real = check_mcm( wfo.name )

        if not announced:
            wfh.sendLog('transferor', "does not look announced.")

            
        if not is_real:
            wfh.sendLog('transferor', "does not appear to be genuine.")

            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time))
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle))
                else:
                    wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along))
                    if not options.go: 
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer))
                else:
                    wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer))
                    if not options.go: 
                        no_budget = True


        if no_budget:
            continue

        ## the site white list considers site, campaign, memory and core information
        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')


        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            NLI.lock( dataset )

        if not sites_allowed:
            wfh.sendLog('transferor',"not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list
            blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) ))

        if blocks:
            print "Reading",len(blocks),"in block whitelist"

        can_go = True
        staging=False
        allowed=True
        primary_destinations = set()
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chope the primary dataset 
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add( wfo.id )

                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))

                wfh.sendLog('transferor',"Would make %s  from cpu requirement %s"%( copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign, copies_needed)
                    
                    wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign']))


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]


                if len(prim_location) >= copies_needed:
                    wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed)
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute))
                if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there is openings let me go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    #for site in sites_allowed:
                    #    #increment accross the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority))
                    else:
                        wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first()
                        
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching ## make it positive ever

                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed)                    
                if copies_needed == 0:
                    wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute)==0:
                    wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seems available")
                    prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                    prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        ### hard include the tape disk andpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical')
                            wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere"%(prim))
                            staging=False
                            can_go = False
                    
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys())))
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )
                        transfers_per_sites[site] += 1
                        primary_destinations.add( site ) 
        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue


        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination  = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation']

            print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add( wfo.id )

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if site in se_allowed])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 


                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination))

                if len( sec_to_distribute )>0:
                    print "secondary could go to",sorted(sec_to_distribute)
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se =SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical')
                else:
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging")
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along+=1
            wfh.sendLog('transferor', "setting status to %s"%wfo.status)
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor', "setting status to %s"%wfo.status)
                    session.commit()
            wfh.sendLog('transferor',"needs a transfer")
            needs_transfer+=1
            passing_along+=1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se)
        

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks'%len(blocks)
        details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets'% len(datasets)
        details_text += '\n\t%s'%sorted(datasets)
        
        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            priority = 'normal'
            cds = [ds for ds in datasets+block_datasets if ds in max_priority]
            if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too large though
                if any([max_priority[ds]>=90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds]<80000 for ds in cds]):
                    priority = 'low'
                
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority)
        else:
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            else:
                new_transfer.phedexid = phedexid ## make it positive again

            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
Example #29
0
def recoveror(url,specific,options=None):
    if userLock('recoveror'): return

    up = componentInfo(soft=['mcm','wtc','jira'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    use_recoveror = UC.get('use_recoveror')

    if not use_recoveror and not options.go:
        print "We are told not to run recoveror"
        return 

    def make_int_keys( d ):
        for code in d:
            d[int(code)] = d.pop(code)

    error_codes_to_recover = UC.get('error_codes_to_recover')
    error_codes_to_block = UC.get('error_codes_to_block')
    error_codes_to_notify = UC.get('error_codes_to_notify')
    make_int_keys( error_codes_to_recover )
    make_int_keys( error_codes_to_block )
    make_int_keys( error_codes_to_notify )

    #wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all()
    wfs = session.query(Workflow).filter(Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() )

    for wfo in wfs:
        if specific and not specific in wfo.name:continue

        if not specific and 'manual' in wfo.status: continue
        
        wfi = workflowInfo(url, wfo.name)

        ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

        all_errors = {}
        try:
            ## this is clearly very truncated and should be changed completely
            wfi.getSummary()
            all_errors = wfi.summary['errors']
        except:
            pass

        print '-'*100        
        print "Looking at",wfo.name,"for recovery options"

        recover = True       

        if not 'MergedLFNBase' in wfi.request:
            print "f****d up"
            sendEmail('missing lfn','%s wl cache is screwed up'%wfo.name)
            recover = False
 
        if not len(all_errors): 
            print "\tno error for",wfo.name
            recover = False

        task_to_recover = defaultdict(list)
        message_to_ops = ""
        message_to_user = ""

        if 'LheInputFilese' in wfi.request and wfi.request['LheInputFiles']:
            ## we do not try to recover pLHE
            recover = False

        if wfi.request['RequestType'] in  ['MonteCarlo','ReReco']:
            recover = False

        if 'Campaign' in wfi.request:
            c = wfi.request['Campaign']
            if c in CI.campaigns and 'recover' in CI.campaigns[c]:
                recover=CI.campaigns[c]['recover']

        for task,errors in all_errors.items():
            print "\tTask",task
            ## collect all error codes and #jobs regardless of step at which it occured
            all_codes = []
            for name, codes in errors.items():
                if type(codes)==int: continue
                all_codes.extend( [(int(code),info['jobs'],name,list(set([e['type'] for e in info['errors']])),list(set([e['details'] for e in info['errors']])) ) for code,info in codes.items()] )

            all_codes.sort(key=lambda i:i[1], reverse=True)
            sum_failed = sum([l[1] for l in all_codes])

            for errorCode,njobs,name,types,details in all_codes:
                rate = 100*njobs/float(sum_failed)
                #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name)
                print ("\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, ','.join(types), name)
                    
                added_in_recover=False

                #if options.go:
                # force the recovery of any task with error ?

                if errorCode in error_codes_to_recover:
                    ## the error code is registered
                    for case in error_codes_to_recover[errorCode]:
                        match = case['details']
                        matched= (match==None)
                        if not matched:
                            matched=False
                            for detail in details:
                                if match in detail:
                                    print "[recover] Could find keyword",match,"in"
                                    print 50*"#"
                                    print detail
                                    print 50*"#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => we should be able to recover that", case['legend']
                            task_to_recover[task].append( (code,case) )
                            added_in_recover=True
                            message_to_user = ""
                        else:
                            print "\t\t recoverable but not frequent enough, needs",case['rate']

                if errorCode in error_codes_to_block:
                    for case in error_codes_to_block[errorCode]:
                        match = case['details']
                        matched= (match==None)
                        if not matched:
                            matched=False
                            for detail in details:
                                if match in detail:
                                    print "[block] Could find keyword",match,"in"
                                    print 50*"#"
                                    print detail
                                    print 50*"#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => that error means no ACDC on that workflow", case['legend']
                            if not options.go:
                                message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n "%( wfo.name, errorCode, '#'*50 )
                                recover = False
                                added_in_recover=False

                            
                
                if errorCode in error_codes_to_notify and not added_in_recover:
                    print "\t\t => we should notify people on this"
                    message_to_user += "%s has an error %s in processing.\n%s\n" %( wfo.name, errorCode, '#'*50 )



        if message_to_user:
            print wfo.name,"to be notified to user(DUMMY)",message_to_user

        if message_to_ops:
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
            sendLog('recoveror',message_to_ops,level='warning')

        if len(task_to_recover) != len(all_errors):
            print "Should not be doing partial ACDC. skipping"
            #sendEmail('recoveror','do not want to make partial acdc on %s'%wfo.name)
            sendLog('recoveror','do not want to make partial acdc on %s'%wfo.name, level='warning')
            recover = False

        if task_to_recover and recover:
            print "Initiating recovery"
            print ', '.join(task_to_recover.keys()),"to be recovered"

            recovering=set()
            for task in task_to_recover:
                print "Will be making a recovery workflow for",task

                ## from here you can fetch known solutions, to known error codes
                actions = list(set([case['solution'] for code,case in task_to_recover[task]  ]))
                acdc = singleRecovery(url, task, wfi.request , actions, do = options.do)

                if not acdc:
                    if options.do:
                        if recovering:
                            print wfo.name,"has been partially ACDCed. Needs manual attention"
                            #sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**'])
                            sendLog('recoveror', "%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), level='critical')
                            continue
                        else:
                            print wfo.name,"failed recovery once"
                            #break
                            continue
                    else:
                        print "no action to take further"
                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfo.name, level='critical')
                        continue
                        
                
                ## and assign it ?
                team = wfi.request['Team']
                #assign_to_sites = set(SI.sites_ready) ## that needs to be massaged to prevent assigning to something out.
                assign_to_sites = set(SI.all_sites)
                parameters={
                    #'SiteWhitelist' : wfi.request['SiteWhitelist'],
                    'SiteWhitelist' : sorted(assign_to_sites),
                    'AcquisitionEra' : wfi.acquisitionEra(),
                    'ProcessingString' :  wfi.processingString(),
                    'MergedLFNBase' : wfi.request['MergedLFNBase'],
                    'ProcessingVersion' : wfi.request['ProcessingVersion'],
                    }
                ## hackery for ACDC merge assignment
                if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary
                if 'TrustSitelists' in wfi.request and wfi.request['TrustSitelists']:
                    parameters['TrustSitelists'] = True
                if 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                    parameters['TrustPUSitelists'] = True

                if options.ass:
                    print "really doing the assignment of the ACDC",acdc
                    parameters['execute']=True
                    wfi.sendLog('recoveror',"%s  was assigned for recovery"% acdc)
                else:
                    print "no assignment done with this ACDC",acdc
                    sendLog('recoveror',"%s needs to be assigned"%(acdc), level='critical')


                result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                if not result:
                    print acdc,"was not asigned"
                    sendLog('recoveror',"%s needs to be assigned"%(acdc), level='critical')
                else:
                    recovering.add( acdc )

            current = None
            if recovering:
                #if all went well, set the status to -recovering 
                current = wfo.status 
                if options.ass:
                    current = current.replace('recovery','recovering')
                else:
                    current = 'assistance-manual'
                print 'created ACDC: '+', '.join( recovering )
            else:
                ## was set to be recovered, and no acdc was made
                current = 'assistance-manual'

            if current:
                print wfo.name,"setting the status to",current
                wfo.status = current
                session.commit()
        else:
            ## this workflow should be handled manually at that point
            print wfo.name,"needs manual intervention"
            wfo.status = 'assistance-manual'
            session.commit()
Example #30
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered','staging'])
    if specific:
        fetch_from.extend(['considered-tried'])
    
    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from",fetch_from

    for status in fetch_from:
        wfos.extend(session.query(Workflow).filter(Workflow.status==status).all())

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read())
    aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ))
    all_stuck.update( getAllStuckDataset()) 

    max_per_round = UC.get('max_per_round').get('assignor',None)
    max_cpuh_block = UC.get('max_cpuh_block')
    random.shuffle( wfos )
    for wfo in wfos:
        
        if options.limit and (n_stalled+n_assigned)>options.limit:
            break

        if max_per_round and (n_stalled+n_assigned)>max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo( url, wfo.name)

        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue

        options_text=""
        if options.early: options_text+=", early option is ON"
        if options.partial: 
            options_text+=", partial option is ON"
            options_text+=", good fraction is %.2f"%options.good_enough
        


        wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        
        
        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = False
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update( CI.campaigns[campaign] )

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go=True
                    wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)))
                    sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical')

        if secondary and check_secondary:
            if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)):
                wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))))
                sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update( allowed_secondary[sec] )

        if no_go:
            n_stalled+=1 
            continue


            
        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor',"cannot decide on version number")
                n_stalled+=1
                wfo.status = 'trouble'
                session.commit()
                continue


        original_sites_allowed = copy.deepcopy( sites_allowed )
        wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))

        wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed))
        secondary_locations=None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update( CI.campaigns[wfh.request['Campaign']] )

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))
            
        do_partial = options.good_enough if options.partial else do_partial


        for sec in list(secondary):
            if override_sec_location: 
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass
            if secondary_aaa:
                #just continue without checking
                continue

            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor",dataset_endpoints[prim]
                endpoints.update( dataset_endpoints[prim] )
            set_lfn = getLFNbase( prim )
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready]))
                down_time = True
                ## should this be send back to considered ?
                

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        wfh.sendLog('assignor',"we need %s CPUh"%cpuh)
        if cpuh>max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)
        
        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off")
                primary_aaa=False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update( aaa_mapping.get(site,[]) )
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed))
                
        if not primary_aaa:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints",sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled+=1
                continue
            
            
        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue


        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low )))
            copies_wanted = max(1., copies_wanted-1.)


        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            above_good = all([available >= do_partial for available in available_fractions.values()])
            wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay')
                n_stalled+=1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good):
                    wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor',"setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is",wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled+=1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',"cannot be assign with no matched sites")
                sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
            n_stalled+=1
            continue


        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
            
            
        wfh.sendLog('assignor',"Placing the output on %s"%sites_out)
        parameters={
            'SiteWhitelist' : sites_allowed,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed))            

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed))            


        if 'parameters' in assign_parameters:
            parameters.update( assign_parameters['parameters'] )

        ## plain assignment here
        team='production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team

        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on 
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name)
            

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if type(v)==str and ',' in v: 
                        parameters[key] = filter(None,v.split(','))
                    else: 
                        parameters[key] = v

        if lheinput:
            ## throttle reading LHE article 
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        ## pick up campaign specific assignment parameters
        #parameters.update( CI.parameters(wfh.request['Campaign']) )
        parameters.update( assign_parameters.get('parameters',{}) )

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check!=True:
            parameters.update( split_check )
            if 'NoGo' in split_check.values():
                wfh.sendLog('assignor', "Failing splitting check")
                sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical')
                n_stalled+=1
                continue

            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical')
                ## we have a problem here, that EventBased should never be used as a backup
                if not options.go:  
                    n_stalled+=1
                    continue
                continue ## skip all together
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of events per job")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical')
            elif 'EventsPerLumi' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this")

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical')
                    wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical')
                        wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical')
                        wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.")


        
        
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        #NLI.lock( secure )
                        LI.lock( secure, reason = 'assigning')
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"
                    
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                wfh.sendLog('assignor',"Failed to assign. Please check the logs")
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
Example #31
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock():
        return
    if duplicateLock():
        return
    if not componentInfo().check():
        return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos = []
    if specific or options.early:
        wfos.extend(session.query(Workflow).filter(Workflow.status == "considered").all())
        wfos.extend(session.query(Workflow).filter(Workflow.status == "staging").all())
    if specific:
        wfos.extend(session.query(Workflow).filter(Workflow.status == "considered-tried").all())
    wfos.extend(session.query(Workflow).filter(Workflow.status == "staged").all())

    dataset_endpoints = json.loads(open("%s/dataset_endpoints.json" % monitor_dir).read())

    max_per_round = UC.get("max_per_round").get("assignor", None)
    max_cpuh_block = UC.get("max_cpuh_block")
    random.shuffle(wfos)
    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(","))):
                continue
            # if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)
        wfh.sendLog("assignor", "%s to be assigned" % wfo.name)

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList()

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            n_stalled += 1
            no_go = True

        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and "secondaries" in CI.campaigns[campaign]:
                allowed_secondary.update(CI.campaigns[campaign]["secondaries"])
        if (secondary and allowed_secondary) and (set(secondary) & allowed_secondary != set(secondary)):
            wfh.sendLog("assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)))
            # sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary)))
            sendLog(
                "assignor",
                "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)),
                level="critical",
            )
            if not options.go:
                n_stalled += 1
                no_go = True

        if no_go:
            continue

        ## check on current status for by-passed assignment
        if wfh.request["RequestStatus"] != "assignment-approved":
            if not options.test:
                wfh.sendLog("assignor", "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request["RequestStatus"]
                wfo.status = "away"
                session.commit()
                continue
            else:
                print wfo.name, wfh.request["RequestStatus"]

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog("assignor", "cannot decide on version number")
                n_stalled += 1
                wfo.status = "trouble"
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog("assignor", "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request["Campaign"], "SecondaryLocation", [])

        blocks = []
        if "BlockWhitelist" in wfh.request:
            blocks = wfh.request["BlockWhitelist"]
        if "RunWhitelist" in wfh.request and wfh.request["RunWhitelist"]:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset, runs=wfh.request["RunWhitelist"])))

        wfh.sendLog("assignor", "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        if (
            "Campaign" in wfh.request
            and wfh.request["Campaign"] in CI.campaigns
            and "primary_AAA" in CI.campaigns[wfh.request["Campaign"]]
        ):
            primary_aaa = primary_aaa or CI.campaigns[wfh.request["Campaign"]]["primary_AAA"]
        secondary_aaa = options.secondary_aaa
        if (
            "Campaign" in wfh.request
            and wfh.request["Campaign"] in CI.campaigns
            and "secondary_AAA" in CI.campaigns[wfh.request["Campaign"]]
        ):
            secondary_aaa = secondary_aaa or CI.campaigns[wfh.request["Campaign"]]["secondary_AAA"]

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check", "but we cannot yet IMO")
                # pass
            if secondary_aaa:
                # just continue without checking
                continue

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.0]
            # one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            # sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]

        wfh.sendLog("assignor", "From secondary requirement, now Allowed%s" % sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = "/store/mc"  ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks
            )
            # sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            # sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [
                site
                for site in sites_with_data
                if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there]
            ]
            sites_with_data = [
                site
                for site in sites_with_data
                if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.0]
            ]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog(
                "assignor",
                "Holding the data but not allowed %s"
                % sorted(
                    list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))
                ),
            )
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            # opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list(
                        (set(secondary_locations) & set(primary_locations))
                        - set([SI.CE_to_SE(site) for site in sites_allowed])
                    )
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog("assignor", "We could be running in addition at %s" % sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog(
                    "assignor",
                    "One of the usable site is in downtime %s"
                    % ([osite in SI.sites_not_ready for osite in opportunistic_sites]),
                )
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog("assignor", "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            # sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                "assignor",
                "%s requires a large numbr of CPUh %s , not assigning, please check with requester" % (wfo.name, cpuh),
                level="critical",
            )
            wfh.sendLog("assignor", "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if (
            "Campaign" in wfh.request
            and wfh.request["Campaign"] in CI.campaigns
            and "maxcopies" in CI.campaigns[wfh.request["Campaign"]]
        ):
            copies_needed_from_campaign = CI.campaigns[wfh.request["Campaign"]]["maxcopies"]
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1, copies_wanted - less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog("assignor", "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available >= 1.0 for available in available_fractions.values()])
            wfh.sendLog(
                "assignor",
                "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values()),
            )
            if down_time and not options.go and not options.early:
                wfo.status = "considered"
                session.commit()
                wfh.sendLog("assignor", "sending back to considered because of site downtime, instead of waiting")
                # sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    "assignor",
                    "%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered."
                    % (wfo.name),
                    level="delay",
                )
                continue
                # pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open("cannot_assign.json").read())
                except:
                    pass
                if (
                    not wfo.name in known
                    and not options.limit
                    and not options.go
                    and not options.early
                    and not options.partial
                ):
                    wfh.sendLog(
                        "assignor",
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)),
                    )
                    sendEmail(
                        "cannot be assigned",
                        "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)),
                    )
                    known.append(wfo.name)
                    open("cannot_assign.json", "w").write(json.dumps(known, indent=2))
                n_stalled += 1
                if options.early:
                    if wfo.status == "considered":
                        wfh.sendLog("assignor", "setting considered-tried")
                        wfo.status = "considered-tried"
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if options.partial:
                    print "Will move on with partial locations"
                else:
                    continue

        ## default back to white list to original white list with any data
        print "Allowed", sorted(sites_allowed)

        if primary_aaa:
            sites_allowed = initial_sites_allowed
            options.TrustSitelists = True
            wfh.sendLog("assignor", "Selected to read primary through xrootd %s" % sorted(sites_allowed))
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog("assignor", "Selected for any data %s" % sorted(sites_allowed))

        if secondary_aaa:
            options.TrustPUSitelists = True
            wfh.sendLog("assignor", "Reading secondary through xrootd from %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if endpoints and options.partial:
            sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints]))
            print "with added endpoints", sorted(sites_allowed)

        if not len(sites_allowed):
            wfh.sendLog("assignor", "cannot be assign with no matched sites")
            sendLog("assignor", "%s has no whitelist" % wfo.name, level="critical")
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith("T1")]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]

        wfh.sendLog("assignor", "Placing the output on %s" % sites_out)
        parameters = {
            "SiteWhitelist": sites_allowed,
            "NonCustodialSites": sites_out,
            "AutoApproveSubscriptionSites": list(set(sites_out)),
            "AcquisitionEra": wfh.acquisitionEra(),
            "ProcessingString": wfh.processingString(),
            "MergedLFNBase": set_lfn,
            "ProcessingVersion": version,
        }

        ## plain assignment here
        team = "production"
        if os.getenv("UNIFIED_TEAM"):
            team = os.getenv("UNIFIED_TEAM")
        if options and options.team:
            team = options.team

        if False and "T2_CH_CERN" in parameters["SiteWhitelist"]:
            ## add some check on
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters["SiteWhitelist"] = ["T2_CH_CERN_HLT"]
            team = "hlt"
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name)

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if type(v) == str and "," in v:
                        parameters[key] = filter(None, v.split(","))
                    else:
                        parameters[key] = v

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog("assignor", "Setting the number of events per job to 500k max")
            parameters["EventsPerJob"] = 500000

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request["Campaign"]))

        if not options.test:
            parameters["execute"] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check != True:
            parameters.update(split_check)
            if "EventBased" in split_check.values():
                wfh.sendLog("assignor", "Falling back to event splitting.")
                # sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog(
                    "assignor",
                    "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"
                    % wfo.name,
                    level="critical",
                )
            elif "EventsPerJob" in split_check.values():
                wfh.sendLog("assignor", "Modifying the number of job per event")
                # sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog(
                    "assignor", "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level="critical"
                )

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if "PU_RD" in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if "PU_RD2" in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    # sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog(
                        "assignor",
                        "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob),
                        level="critical",
                    )
                    wfh.sendLog("assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob))
                    parameters["EventsPerJob"] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl["events_per_job"] if "events_per_job" in spl else None
                    eventsPerJobEstimated = spl["avg_events_per_job"] if "avg_events_per_job" in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        # sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog(
                            "assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level="critical"
                        )
                        wfh.sendLog("assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob))
                        parameters["LumisPerJob"] = lumisPerJob
                    else:
                        # sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            "assignor",
                            "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name),
                            level="critical",
                        )
                        wfh.sendLog("assignor", "leaving splitting untouched for PU_RD*, please check.")

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = "away"
                session.commit()
                n_assigned += 1
                wfh.sendLog("assignor", "Properly assigned\n%s" % (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(sec) + new_wfi.request["OutputDatasets"]:
                        ## lock all outputs flat
                        NLI.lock(secure)
                    # for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog("assignor", "Assigned %d Stalled %s" % (n_assigned, n_stalled))
Example #32
0
def recoveror(url,specific,options=None):
    if userLock('recoveror'): return

    up = componentInfo()
    CI = campaignInfo()

    UC = unifiedConfiguration()

    def make_int_keys( d ):
        for code in d:
            d[int(code)] = d.pop(code)

    error_codes_to_recover = UC.get('error_codes_to_recover')
    error_codes_to_block = UC.get('error_codes_to_block')
    error_codes_to_notify = UC.get('error_codes_to_notify')
    make_int_keys( error_codes_to_recover )
    make_int_keys( error_codes_to_block )
    make_int_keys( error_codes_to_notify )

    wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all()
    if specific:
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() )

    for wfo in wfs:
        if specific and not specific in wfo.name:continue

        if not specific and 'manual' in wfo.status: continue
        
        wfi = workflowInfo(url, wfo.name, deprecated=True) ## need deprecated info for mergedlfnbase

        ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

        all_errors = None
        try:
            wfi.getSummary()
            all_errors = wfi.summary['errors']
        except:
            pass

        print '-'*100        
        print "Looking at",wfo.name,"for recovery options"
        
        if not len(all_errors): 
            print "\tno error for",wfo.name

        task_to_recover = defaultdict(list)
        message_to_ops = ""
        message_to_user = ""

        recover=True
        if 'LheInputFilese' in wfi.request and wfi.request['LheInputFiles']:
            ## we do not try to recover pLHE
            recover = False

        if 'Campaign' in wfi.request:
            c = wfi.request['Campaign']
            if c in CI.campaigns and 'recover' in CI.campaigns[c]:
                recover=CI.campaigns[c]['recover']

        for task,errors in all_errors.items():
            print "\tTask",task
            ## collect all error codes and #jobs regardless of step at which it occured
            all_codes = []
            for name, codes in errors.items():
                if type(codes)==int: continue
                all_codes.extend( [(int(code),info['jobs'],name,list(set([e['type'] for e in info['errors']])),list(set([e['details'] for e in info['errors']])) ) for code,info in codes.items()] )

            all_codes.sort(key=lambda i:i[1], reverse=True)
            sum_failed = sum([l[1] for l in all_codes])

            for errorCode,njobs,name,types,details in all_codes:
                rate = 100*njobs/float(sum_failed)
                #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name)
                print ("\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, ','.join(types), name)
                    
                added_in_recover=False

                #if options.go:
                # force the recovery of any task with error ?

                if errorCode in error_codes_to_recover:
                    ## the error code is registered
                    for case in error_codes_to_recover[errorCode]:
                        match = case['details']
                        matched= (match==None)
                        if not matched:
                            matched=False
                            for detail in details:
                                if match in detail:
                                    print "[recover] Could find keyword",match,"in"
                                    print 50*"#"
                                    print detail
                                    print 50*"#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => we should be able to recover that", case['legend']
                            task_to_recover[task].append( (code,case) )
                            added_in_recover=True
                            message_to_user = ""
                        else:
                            print "\t\t recoverable but not frequent enough, needs",case['rate']

                if errorCode in error_codes_to_block:
                    for case in error_codes_to_block[errorCode]:
                        match = case['details']
                        matched= (match==None)
                        if not matched:
                            matched=False
                            for detail in details:
                                if match in detail:
                                    print "[block] Could find keyword",match,"in"
                                    print 50*"#"
                                    print detail
                                    print 50*"#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => that error means no ACDC on that workflow", case['legend']
                            if not options.go:
                                message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n "%( wfo.name, errorCode, '#'*50 )
                                recover = False
                                added_in_recover=False

                            
                
                if errorCode in error_codes_to_notify and not added_in_recover:
                    print "\t\t => we should notify people on this"
                    message_to_user += "%s has an error %s in processing.\n%s\n" %( wfo.name, errorCode, '#'*50 )



        if message_to_user:
            print wfo.name,"to be notified to user(DUMMY)",message_to_user

        if message_to_ops:
            sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**','*****@*****.**'])


        if task_to_recover and recover:
            print "Initiating recovery"
            print ', '.join(task_to_recover.keys()),"to be recovered"

            recovering=set()
            for task in task_to_recover:
                print "Will be making a recovery workflow for",task

                ## from here you can fetch known solutions, to known error codes
                actions = list(set([case['solution'] for code,case in task_to_recover[task]  ]))
                acdc = singleRecovery(url, task, wfi.request , actions, do = options.do)

                if not acdc:
                    if options.do:
                        if recovering:
                            print wfo.name,"has been partially ACDCed. Needs manual attention"
                            sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**','*****@*****.**'])
                            continue
                        else:
                            print wfo.name,"failed recovery once"
                            break
                    else:
                        print "no action to take further"
                        sendEmail("an ACDC that can be done automatically","please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details", destination=['*****@*****.**','*****@*****.**'])
                        continue
                        
                
                ## and assign it ?
                team = wfi.request['Teams'][0]
                parameters={
                    'SiteWhitelist' : wfi.request['SiteWhitelist'],
                    'AcquisitionEra' : wfi.acquisitionEra(),
                    'ProcessingString' :  wfi.processingString(),
                    'MergedLFNBase' : wfi.deprecated_request['MergedLFNBase'],
                    'ProcessingVersion' : wfi.request['ProcessingVersion'],
                    }
                
                if options.ass:
                    print "really doing the assignment of the ACDC",acdc
                    parameters['execute']=True
                    sendEmail("an ACDC was done and WAS assigned", "%s  was assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**','*****@*****.**'])
                else:
                    print "no assignment done with this ACDC",acdc
                    sendEmail("an ACDC was done and need to be assigned", "%s needs to be assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**','*****@*****.**'])

                result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                recovering.add( acdc )

            if recovering:
                #if all went well, set the status to -recovering 
                current = wfo.status 
                if options.ass:
                    current = current.replace('recovery','recovering')
                else:
                    current = 'assistance-manual'
                print wfo.name,"setting the status to",current
                print ', '.join( recovering )
                wfo.status = current
                session.commit()
        else:
            ## this workflow should be handled manually at that point
            print wfo.name,"needs manual intervention"
            wfo.status = 'assistance-manual'
            session.commit()
Example #33
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI
    #LI = lockInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    if specific or options.early:
        wfos.extend( session.query(Workflow).filter(Workflow.status=='considered').all())
        wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
    if specific:
        wfos.extend( session.query(Workflow).filter(Workflow.status=='considered-tried').all())        
    wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())


    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read())

    max_per_round = UC.get('max_per_round').get('assignor',None)
    max_cpuh_block = UC.get('max_cpuh_block')
    random.shuffle( wfos )
    for wfo in wfos:
        if options.limit and (n_stalled+n_assigned)>options.limit:
            break

        if max_per_round and (n_stalled+n_assigned)>max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo( url, wfo.name)
        wfh.sendLog('assignor',"%s to be assigned"%wfo.name)

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()

        ## check if by configuration we gave it a GO
        no_go = False
        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if not CI.go( campaign ):    
                wfh.sendLog('assignor',"No go for %s"%campaign)
                if not options.go:
                    n_stalled+=1
                    no_go = True
                    break
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
        if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)):
            wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)))
            #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary)))
            sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)), level='critical')
            if not options.go:
                n_stalled+=1
                no_go = True
                
        if no_go:
            continue


        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor',"cannot decide on version number")
                n_stalled+=1
                wfo.status = 'trouble'
                session.commit()
                continue


        original_sites_allowed = copy.deepcopy( sites_allowed )
        wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))

        wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed))
        secondary_locations=None
        for sec in list(secondary):
            if override_sec_location: 
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor",dataset_endpoints[prim]
                endpoints.update( dataset_endpoints[prim] )
            set_lfn = getLFNbase( prim )
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite in SI.sites_not_ready for osite in opportunistic_sites]))
                down_time = True
                ## should this be send back to considered ?
                

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        wfh.sendLog('assignor',"we need %s CPUh"%cpuh)
        if cpuh>max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)
        
        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay')
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial:
                    wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor',"setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is",wfo.status
                if options.partial:
                    print "Will move on with partial locations"
                else:
                    continue

        ## default back to white list to original white list with any data
        print "Allowed",sorted(sites_allowed)
        if options.primary_aaa:
            sites_allowed = initial_sites_allowed
            #options.useSiteListAsLocation = True
            options.TrustSitelists = True
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed))

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        ### check on endpoints for on-going transfers
        if endpoints and options.partial:
            sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints]))
            print "with added endpoints",sorted(sites_allowed)
            
        #if options.partial:
        #    continue



        if not len(sites_allowed):
            wfh.sendLog('assignor',"cannot be assign with no matched sites")
            #sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name))
            sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
            n_stalled+=1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
            
        ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist
        ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways
            
        wfh.sendLog('assignor',"Placing the output on %s"%sites_out)
        parameters={
            'SiteWhitelist' : sites_allowed,
            #'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }


        ## plain assignment here
        team='production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team

        ## high priority team agent
        #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000:
        #    team = 'highprio'
        #    sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**'])

        ## SDSC redirection
        #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]):
        #    ## consider SDSC
        #    parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC']
        #    parameters['useSiteListAsLocation'] = True
        #    team = 'allocation-based'
        #    sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**'])
        
        ## SDSC redirection
        #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0:
        #    parameters['SiteWhitelist'] = ['T3_US_SDSC']
        #    team = 'allocation-based'
        #    sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**'])
        

        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on 
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name)
            

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if type(v)==str and ',' in v: 
                        parameters[key] = filter(None,v.split(','))
                    else: 
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check!=True:
            parameters.update( split_check )
            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting'%wfo.name, level='critical')
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of job per event")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical')

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical')
                    wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical')
                        wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical')
                        wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.")


        
        
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock( secure )
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"
                    
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
Example #34
0
def actor(url, options=None):

    mlock = moduleLock(wait=False, silent=True)
    if mlock(): return
    if userLock('actor'): return

    up = componentInfo(soft=['mcm'])
    if not up.check(): return

    # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    WC = wtcClient()
    WI = wtcInfo()
    JC = JIRAClient()

    action_list = WC.get_actions()
    if action_list is None:
        print "Not able to load action list"
        sendLog('actor', 'Not able to load action list', level='critical')
        return

    if options.actions:
        action_list = json.loads(open(options.actions).read())

    print json.dumps(action_list, indent=2)
    if not action_list:
        print "EMPTY!"
        return

    wf_list = action_list.keys()
    print json.dumps(sorted(wf_list), indent=2)
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print '-' * 100
        print "Looking at", wfname, "for recovery options"

        to_clone = False
        to_acdc = False
        to_force = False
        to_hold = False
        something_to_do = False
        tasks = action_list[wfname].get('Parameters', None)
        to_acdc = action_list[wfname].get('Action', None) == 'acdc'
        to_clone = action_list[wfname].get('Action', None) == 'clone'
        to_force = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['by-pass', 'bypass']
        to_hold = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['onhold', 'on-hold']

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog(
                'actor',
                'Action submitted for something other than acdc, clone, bypass or hold for workflow %s'
                % wfname,
                level='critical')
            print json.dumps(action_list[wfname], indent=2)
            continue

        if not tasks and to_acdc:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

        #===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            comment = ""
            if 'comment' in tasks: comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            #Reject all workflows in the family
            inv_results = invalidate(url,
                                     wfi,
                                     only_resub=False,
                                     with_output=True)
            all_good = all(inv_results)
            if all_good:
                wfi.sendLog('actor', "%s and children are rejected" % wfname)
            else:
                wfi.sendLog('actor',
                            "Failed to reject the request and dependents")
                sendLog('actor',
                        'Failed to reject the familly of %s' % wfname,
                        level='critical')
                continue

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print str(e)
                ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')

            else:
                wfi.sendLog('actor',
                            "Workflow %s cloned into %s" % (wfname, cloned))
                ## set to trouble for swift replacement
                for wfo in session.query(Workflow).filter(
                        Workflow.name == wfname).all():
                    wfo.status = 'trouble'
                session.commit()
#===========================================================
        elif to_force:
            wfi.sendLog(
                'actor',
                'Force-completing from workflow traffic controler request')
            WI.add(action='force',
                   keyword=wfname,
                   user=action_list[wfname].get('user', 'unified'))
        elif to_hold:
            wfi.sendLog('actor',
                        'Holding on workflow traffic controler request')
            WI.add(action='hold',
                   keyword=wfname,
                   user=action_list[wfname].get('user', 'unified'))
#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        if setting in tasks[task]:
                            tasks[task][setting] = allTasksDefaults[setting]
                        else:
                            tasks[task].append(
                                {setting: allTasksDefaults[setting]})
            print "Tasks is "
            print json.dumps(tasks, indent=2)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print "FYI getWMErrors is blank. Presumably there are only unreported errors"
#                continue

            try:
                where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo(
                )
                print "Where to run = "
                print where_to_run
                if not where_to_run:
                    sendLog(
                        'actor',
                        'Cannot create ACDCS for %s because recovery info cannot be found.'
                        % wfname,
                        level='critical')
                    continue
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because site list cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""

            num_tasks_to_recover = 0

            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task: continue
                    if 'Cleanup' in task: continue
                    if not 'jobfailed' in WMErr[task]:
                        continue
                    else:
                        num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for", wfname
#            recover = False

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)

#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
#   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
                print "Full task name is " + fulltaskname
                print where_to_run.keys()
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if not fulltaskname in where_to_run.keys():
                    wrong_task = True
                    wfi.sendLog(
                        'actor',
                        "Skipping task %s because there is no acdc doc for it anyways."
                        % (fulltaskname))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                set([
                                    SI.SE_to_CE(site)
                                    for site in actions[action]
                                ]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(
                        set([
                            SI.SE_to_CE(site)
                            for site in where_to_run[fulltaskname]
                        ]))
                    print "Found", sorted(
                        assign_to_sites
                    ), "as sites where to run the ACDC at, from the acdc doc of ", wfname
                print "Going to run at", sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url,
                                          fulltaskname,
                                          wfi.request,
                                          actions,
                                          do=options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)),
                                    level='critical')
                                wfi.sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
                            #                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else:  #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog(
                            'actor',
                            'ACDC created for task %s. Actions taken \n%s' %
                            (fulltaskname, json.dumps(actions)))
                        jira_comment = "%s created ACDC for task %s with action %s" % (
                            action_list[wfname].get('user', 'unified'),
                            task.split('/')[-1],
                            json.dumps(actions),
                        )
                        reason = action_list[wfname].get('Reason', None)
                        if reason:
                            jira_comment += '\ndue to: %s' % (reason)

                        #team = wfi.request['Teams'][0]
                        team = 'production'
                        parameters = {
                            'SiteWhitelist': sorted(assign_to_sites),
                            'AcquisitionEra': wfi.acquisitionEra(),
                            'ProcessingString': wfi.processingString(),
                            'MergedLFNBase': wfi.request['MergedLFNBase'],
                            'ProcessingVersion':
                            wfi.request['ProcessingVersion'],
                        }
                        ## hackery for ACDC merge assignment
                        if wfi.request[
                                'RequestType'] == 'TaskChain' and 'Merge' in task.split(
                                    '/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request
                              and wfi.request['TrustSitelists'] == 'true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'secondary' in actions:
                            if actions['secondary'] == 'enabled':
                                print 'Enabling reading the secondary input via xrootd'
                                parameters['TrustPUSitelists'] = True
                            elif actions['secondary'] == 'disabled':
                                parameters['TrustPUSitelists'] = False
                            #in case secondary is blank or not set to enabled or disabled
                            elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                    'TrustPUSitelists']:
                                parameters['TrustPUSitelists'] = True
                        elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                'TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC", acdc
                            parameters['execute'] = True
                            #wfi.sendLog('actor',"%s  was assigned for recovery"% acdc)
                        else:
                            print "no assignment done with this ACDC", acdc
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                            wfi.sendLog(
                                'actor',
                                "%s needs to be assigned by hand" % (acdc))
                            continue

#                       print parameters
                        result = reqMgrClient.assignWorkflow(
                            url, acdc, team, parameters)
                        if not result:
                            print acdc, "was not assigned"
                            sendLog('actor',
                                    "%s failed to be assigned" % (acdc),
                                    level='critical')
                            wfi.sendLog(
                                'actor',
                                "%s failed to get assigned for recovery" %
                                acdc)
                        else:
                            wfi.sendLog('actor',
                                        "%s was assigned for recovery" % acdc)
                            recovering.add(acdc)

                        #wfi.sendLog('actor',"ACDCs created for %s"%wfname)
                        try:
                            if jira_comment:
                                jiras = JC.find(
                                    {'prepid': wfi.request['PrepID']})
                                if len(jiras) == 1:
                                    ## put a comment on the single corresponding ticket
                                    JC.comment(jiras[0].key, jira_comment)
                                    JC.progress(jiras[0].key)
                        except Exception as e:
                            print "failed with JIRA"
                            print str(e)

        #===========================================================

        if recover and options.do:
            r = WC.remove_action(wfname)
            if not r:
                sendLog(
                    'actor',
                    'not able to remove the action, interlocking the module',
                    level='critical')
                os.system('touch %s/actor.failed-%s.lock' %
                          (base_eos_dir, os.getpid()))
                sys.exit(-1)

        ## update the status with recovering removing manual
        for wfo in session.query(Workflow).filter(
                Workflow.name == wfname).all():
            wfo.status = wfo.status.replace('manual', 'recovering')
        session.commit()

        if message_to_user:
            print wfname, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')

    return
Example #35
0
def actor(url,options=None):
    
    mlock = moduleLock(wait=False ,silent=True)
    if mlock(): return
    if userLock('actor'): return
    
    up = componentInfo(soft=['mcm'])
    if not up.check(): return
    
   # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    WC = wtcClient()
    WI = wtcInfo()
    JC = JIRAClient()

    action_list = WC.get_actions()
    if action_list is None:
        print "Not able to load action list"
        sendLog('actor','Not able to load action list', level='critical')
        return

    if options.actions:
        action_list = json.loads(open(options.actions).read())

    print json.dumps( action_list, indent=2)
    if not action_list:
        print "EMPTY!"
        return

    wf_list = action_list.keys()
    print json.dumps( sorted( wf_list), indent=2)
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        random.shuffle( wf_list )
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print '-'*100
        print "Looking at",wfname,"for recovery options"

        to_clone = False
        to_acdc = False
        to_force = False
        to_hold = False
        something_to_do = False
        tasks = action_list[wfname].get( 'Parameters' , None)
        to_acdc = action_list[wfname].get( 'Action', None) == 'acdc'
        to_clone = action_list[wfname].get( 'Action', None) == 'clone'
        to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters' ,{}).get('action',None) in ['by-pass', 'bypass']
        to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters' ,{}).get('action',None) in ['onhold','on-hold']

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog('actor','Action submitted for something other than acdc, clone, bypass or hold for workflow %s'%wfname,level='critical')
            print json.dumps( action_list[wfname] , indent=2)
            continue

        if not tasks and to_acdc:
            sendLog('actor','Empty action submitted for workflow %s'%wfname,level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

#===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor','Going to clone %s'%wfname)
            comment=""
            if 'comment' in tasks: comment = ", reason: "+ tasks['comment']
            wfi.sendLog('actor',"invalidating the workflow by traffic controller %s"%comment)

            #Reject all workflows in the family
            inv_results = invalidate(url, wfi, only_resub=False, with_output=True)
            all_good = all(inv_results)
            if all_good:
                wfi.sendLog('actor',"%s and children are rejected"%wfname)
            else:
                wfi.sendLog('actor',"Failed to reject the request and dependents")
                sendLog('actor','Failed to reject the familly of %s'% wfname, level='critical')
                continue

            cloned = None
            try:  
                cloned =  singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog('actor','Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'%wfname,level='critical')
                wfi.sendLog('actor','Failed to create clone for %s!'%wfname)
                print str(e)
                ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again
            if not cloned:
                recover = False
                wfi.sendLog('actor','Failed to create clone for %s!'%wfname)
                sendLog('actor','Failed to create clone for %s!'%wfname,level='critical')

            else:
                wfi.sendLog('actor',"Workflow %s cloned into %s"%(wfname, cloned))
                ## set to trouble for swift replacement
                for wfo in  session.query(Workflow).filter(Workflow.name == wfname).all():
                    wfo.status = 'trouble'
                session.commit()
#===========================================================
        elif to_force:
            wfi.sendLog('actor','Force-completing from workflow traffic controler request')
            WI.add(action='force', keyword = wfname, user = action_list[wfname].get( 'user', 'unified'))
        elif to_hold:
            wfi.sendLog('actor','Holding on workflow traffic controler request')
            WI.add(action='hold', keyword = wfname, user = action_list[wfname].get( 'user', 'unified'))                   
#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        if setting in tasks[task]:
                            tasks[task][setting] = allTasksDefaults[setting]
                        else:
                                tasks[task].append({setting:allTasksDefaults[setting]})
            print "Tasks is "
            print json.dumps(tasks, indent=2)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves
        
            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog('actor','Cannot create ACDCS for %s because WMErr cannot be reached.'%wfname,level='critical')
                continue
            if not WMErr:
                wfi.sendLog('actor','WMErrors is blank for %s.'%wfname)
                print "FYI getWMErrors is blank. Presumably there are only unreported errors"
#                continue

            try:
                where_to_run, missing_to_run,missing_to_run_at =  wfi.getRecoveryInfo()
                print "Where to run = "
                print where_to_run
                if not where_to_run:
                    sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical')
                    continue
            except:
                sendLog('actor','Cannot create ACDCS for %s because recovery info cannot be found.'%wfname,level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog('actor','Cannot create ACDCS for %s because site list cannot be found.'%wfname,level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""
        
            num_tasks_to_recover = 0
        
            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task: continue
                    if 'Cleanup' in task: continue
                    if not 'jobfailed' in WMErr[task]:
                        continue
                    else:
                        num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for",wfname
#            recover = False
        
            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
            ## we do not try to recover pLHE
                sendLog('actor','Cannot create ACDCS for %s because it is a pLHE workflow.'%wfname,level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)


#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
        #   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
                print "Full task name is " + fulltaskname
                print where_to_run.keys()
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in ['Processing','Production','Merge']:
                            wrong_task= True
                            wfi.sendLog('actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"%( fulltaskname, task_info.taskType))
                if not fulltaskname in where_to_run.keys():
                    wrong_task= True
                    wfi.sendLog('actor', "Skipping task %s because there is no acdc doc for it anyways."%(fulltaskname))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites=[SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites=list(set([SI.SE_to_CE(site) for site in actions[action]]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[fulltaskname]]))
                    print "Found",sorted(assign_to_sites),"as sites where to run the ACDC at, from the acdc doc of ",wfname
                print "Going to run at",sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do = options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical')
                                wfi.sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
#                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else: #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog('actor','ACDC created for task %s. Actions taken \n%s'%(fulltaskname,json.dumps(actions)))
                        jira_comment = "%s created ACDC for task %s with action %s"%( 
                            action_list[wfname].get( 'user', 'unified'),
                            task.split('/')[-1] , 
                            json.dumps(actions),
                        )
                        reason = action_list[wfname].get( 'Reason', None)
                        if reason:
                            jira_comment += '\ndue to: %s'%(reason)

                        #team = wfi.request['Teams'][0]
                        team = 'production'
                        parameters={
                        'SiteWhitelist' : sorted(assign_to_sites),
                        'AcquisitionEra' : wfi.acquisitionEra(),
                        'ProcessingString' :  wfi.processingString(),
                        'MergedLFNBase' : wfi.request['MergedLFNBase'],
                        'ProcessingVersion' : wfi.request['ProcessingVersion'],
                        }
                    ## hackery for ACDC merge assignment
                        if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary 
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists']=='true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'secondary' in actions:
                            if actions['secondary'] == 'enabled':
                                print 'Enabling reading the secondary input via xrootd'
                                parameters['TrustPUSitelists'] = True
                            elif actions['secondary'] == 'disabled':
                                parameters['TrustPUSitelists'] = False
                            #in case secondary is blank or not set to enabled or disabled
                            elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                                parameters['TrustPUSitelists'] = True
                        elif 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC",acdc
                            parameters['execute']=True
                            #wfi.sendLog('actor',"%s  was assigned for recovery"% acdc)
                        else:
                            print "no assignment done with this ACDC",acdc
                            sendLog('actor',"%s needs to be assigned"%(acdc), level='critical')
                            wfi.sendLog('actor',"%s needs to be assigned by hand"%(acdc))
                            continue

 #                       print parameters
                        result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                        if not result:
                            print acdc,"was not assigned"
                            sendLog('actor',"%s failed to be assigned"%(acdc), level='critical')
                            wfi.sendLog('actor',"%s failed to get assigned for recovery"% acdc)
                        else:
                            wfi.sendLog('actor',"%s was assigned for recovery"% acdc)
                            recovering.add( acdc )

                        #wfi.sendLog('actor',"ACDCs created for %s"%wfname)
                        try:
                            if jira_comment:
                                jiras = JC.find({'prepid' : wfi.request['PrepID']})
                                if len(jiras)==1:
                                    ## put a comment on the single corresponding ticket
                                    JC.comment(jiras[0].key, jira_comment)
                                    JC.progress(jiras[0].key)
                        except Exception as e:
                            print "failed with JIRA"
                            print str(e)
                        
    
        #===========================================================
        
        
        if recover and options.do:
            r = WC.remove_action(wfname)
            if not r:
                sendLog('actor','not able to remove the action, interlocking the module', level='critical')
                os.system('touch %s/actor.failed-%s.lock'%( base_eos_dir, os.getpid() ))
                sys.exit(-1)

        ## update the status with recovering removing manual
        for wfo in  session.query(Workflow).filter(Workflow.name == wfname).all():
            wfo.status = wfo.status.replace('manual','recovering')
        session.commit()                        

        if message_to_user:
            print wfname,"to be notified to user(DUMMY)",message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')



    return