def changeSplittingWorkflow(url, workflow, split, task, split_type='EventAwareLumi'):
    """Push a new splitting configuration for one task of a workflow.

    The splitting parameters are built by the helper matching ``split_type``
    and then completed with the request name and the task path
    ``/<workflow>/<task>`` before being sent through
    ``reqMgrClient.setWorkflowSplitting``.

    Args:
        url: passed as-is to ``reqMgrClient.setWorkflowSplitting``.
        workflow: request name; also the first component of the task path.
        split: value handed to the parameter builder of the chosen algorithm.
        task: task name, appended to the workflow name to form the task path.
        split_type: one of 'EventAwareLumi', 'Event', 'Lumi' or 'Merge'.

    Raises:
        ValueError: if ``split_type`` is not one of the supported values
            (previously this surfaced as an UnboundLocalError on ``params``).
    """
    if split_type == 'EventAwareLumi':
        params = getEventAwareLumiParams(split)
    elif split_type == 'Event':
        params = getEventBasedParams(split)
    elif split_type == 'Lumi':
        params = getLumiBasedParams(split)
    elif split_type == 'Merge':
        params = getMergeParams(split)
    else:
        # Fail early and explicitly instead of hitting an unbound `params`.
        raise ValueError("Unsupported split_type %r" % (split_type,))

    params['requestName'] = workflow
    params['splittingTask'] = '/%s/%s' % (workflow, task)

    #pprint(params)
    data = reqMgrClient.setWorkflowSplitting(url, params)
    #TODO validate data
    # Single-argument print: same output under the Python 2 print statement.
    print(data)
Exemple #2
0
def changeSplittingWorkflow(url,
                            workflow,
                            split,
                            task,
                            split_type='EventAwareLumi'):
    """Send a new splitting setup for ``/<workflow>/<task>``.

    Selects the parameter builder from ``split_type``, adds the
    ``requestName`` and ``splittingTask`` entries to the returned dict and
    submits it with ``reqMgrClient.setWorkflowSplitting``; the reply is
    printed.

    Args:
        url: forwarded to ``reqMgrClient.setWorkflowSplitting``.
        workflow: name of the request whose splitting is changed.
        split: argument given to the selected parameter builder.
        task: name of the task inside the workflow.
        split_type: 'EventAwareLumi' (default), 'Event', 'Lumi' or 'Merge'.

    Raises:
        ValueError: when ``split_type`` is none of the values above; without
            this check the function failed later on an unbound ``params``.
    """
    if split_type == 'EventAwareLumi':
        params = getEventAwareLumiParams(split)
    elif split_type == 'Event':
        params = getEventBasedParams(split)
    elif split_type == 'Lumi':
        params = getLumiBasedParams(split)
    elif split_type == 'Merge':
        params = getMergeParams(split)
    else:
        # No builder matches: report the offending value to the caller.
        raise ValueError("Unsupported split_type %r" % (split_type, ))

    params['requestName'] = workflow
    params['splittingTask'] = '/%s/%s' % (workflow, task)

    #pprint(params)
    data = reqMgrClient.setWorkflowSplitting(url, params)
    #TODO validate data
    # One-argument form prints identically with the Python 2 print statement.
    print(data)
def singleRecovery(url, task , initial, actions, do=False):
    payload = {
        "Requestor" : os.getenv('USER'),
        "Group" : 'DATAOPS',
        "RequestType" : "Resubmission",
        "ACDCServer" : "https://cmsweb.cern.ch/couchdb",
        "ACDCDatabase" : "acdcserver",
        "OriginalRequestName" : initial['RequestName']
        }
    copy_over = ['PrepID','RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group', 'Memory', 'RequestString' ]        
    for c in copy_over:
        payload[c] = copy.deepcopy(initial[c])

    if actions:
        for action in actions:
            #if action.startswith('split'):
            #    factor = int(action.split('-')[-1]) if '-' in action else 2
            #    print "Changing time per event (%s) by a factor %d"%( payload['TimePerEvent'], factor)
            #    ## mention it's taking 2 times longer to have a 2 times finer splitting
            #    payload['TimePerEvent'] = factor*payload['TimePerEvent']
            if action.startswith('mem'):
                increase = int(action.split('-')[-1]) if '-' in action else 1000
                ## increase the memory requirement by 1G
                payload['Memory'] += increase

    if payload['RequestString'].startswith('ACDC'):
        print "This is not allowed yet"
        return None
    payload['RequestString'] = 'ACDC_'+payload['RequestString']
    payload['InitialTaskPath'] = task 

    if not do:
        print json.dumps( payload, indent=2)
        return None

    ## submit
    response = reqMgrClient.submitWorkflow(url, payload)
    m = re.search("details\/(.*)\'",response)
    if not m:
        print "Error in making ACDC for",initial["RequestName"]
        print response
        response = reqMgrClient.submitWorkflow(url, payload)
        m = re.search("details\/(.*)\'",response)
        if not m:
            print "Error twice in making ACDC for",initial["RequestName"]
            print response
            return None
    acdc = m.group(1)
    
    ## perform modifications
    if actions:
        for action in actions:
            if action.startswith('split'):
                factor = int(action.split('-')[-1]) if '-' in action else 2
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                for split in splittings:
                    for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                        if act in split:
                            print "Changing %s (%d) by a factor %d"%( act, split[act], factor),
                            split[act] /= factor
                            print "to",split[act]
                            break
                    split['requestName'] = acdc
                    print "changing the splitting of",acdc
                    print json.dumps( split, indent=2 )
                    print reqMgrClient.setWorkflowSplitting(url, split )
                
    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print data
    return acdc
Exemple #4
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    # Temporarily switch off prioritization
    random.shuffle(wfos)
    ##order by priority instead of random
    """
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            return cache.index( wfn ) if wfn in cache else 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )
    """

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should bbe all closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1t2_only = [
            ce for ce in sites_allowed
            if [ce.startswith('T1') or ce.startswith('T2')]
        ]
        if t1t2_only:
            # try to pick from T1T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
            # then pick any otherwise
        else:
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            # Do not set TrustPUSitelist to True if there is no secondary
            if secondary:
                parameters['TrustPUSitelists'] = True
                wfh.sendLog(
                    'assignor', "Reading secondary through xrootd at %s" %
                    sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    # FIXME: decide which of the lines below needs to remain...
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Exemple #5
0
def singleRecovery(url, task, initial, actions, do=False):
    """Build, and optionally submit, an ACDC (Resubmission) request.

    The payload is seeded from ``initial`` (the original request), its
    priority is doubled (capped at 500000) and the 'mem' / 'core' actions are
    applied to it. With ``do`` false the payload is only printed. Otherwise
    it is submitted (one retry), the 'split' actions are applied to the new
    workflow, and the workflow is approved.

    Args:
        url: forwarded to the reqMgrClient / workflowInfo calls.
        task: stored as 'InitialTaskPath' in the payload.
        initial: dict of the original request. 'RequestName' and
            'ConfigCacheUrl' are required; the keys in ``copy_over`` are
            copied when present ('RequestPriority' and 'RequestString' are
            needed afterwards).
        actions: iterable of action strings, or None/empty:
            'mem-[T1,T2:]N' sets the memory to N, 'mem-[T1,T2:]+N' raises it
            by N, 'core-[T1,T2:]N' sets the number of cores to N,
            'split[-F]' divides the job granularity by F (default 2).
            The optional task list restricts TaskChain changes to those
            task names.
        do: when false, print the payload as JSON and return None.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the new
        ACDC, or None when nothing was submitted (dry run, splitting asked
        for a request type that does not support it, or two failed
        submissions).
    """
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['ConfigCacheUrl'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0
    }
    copy_over = [
        'PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
        'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion'
    ]
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print("%s not in the initial payload" % (c, ))

    # boost the recovery over the initial wf, never above 500k
    payload['RequestPriority'] = min(500000, payload['RequestPriority'] * 2)

    if actions:
        for action in actions:
            #if action.startswith('split'):
            #    factor = int(action.split('-')[-1]) if '-' in action else 2
            #    print "Changing time per event (%s) by a factor %d"%( payload['TimePerEvent'], factor)
            #    ## mention it's taking 2 times longer to have a 2 times finer splitting
            #    payload['TimePerEvent'] = factor*payload['TimePerEvent']
            if action.startswith('mem'):
                arg = action.split('-', 1)[-1]
                tasks, set_to = arg.split(':') if ':' in arg else (None, arg)
                tasks = tasks.split(',') if tasks else []
                # '+N' is a relative increase, a bare 'N' an absolute value.
                # Exactly one of `increase` / `set_to` is kept: leaving the
                # '+N' string in `set_to` used to end up as the memory value.
                increase = None
                if set_to.startswith('+'):
                    increase = int(set_to[1:])
                    set_to = None
                else:
                    set_to = int(set_to)

                if 'TaskChain' in initial:
                    # Per-task memory, keyed by task name, for Task1..TaskN.
                    mem_dict = {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t not in initial:
                            break
                        tname = payload.setdefault(t, initial[t])['TaskName']
                        mem_dict.setdefault(tname, payload[t]['Memory'])
                        if tasks and tname not in tasks:
                            print("%s not concerned" % (tname, ))
                            continue
                        if increase is not None:
                            mem_dict[tname] += increase
                        else:
                            mem_dict[tname] = set_to
                    payload['Memory'] = mem_dict
                elif increase is not None:
                    payload['Memory'] += increase
                else:
                    payload['Memory'] = set_to

            if action.startswith('split') and (
                    initial['RequestType'] in ['MonteCarlo'] or
                (initial['RequestType'] in ['TaskChain']
                 and not 'InputDataset' in initial['Task1'])):
                print("I should not be doing splitting for this type of request %s"
                      % (initial['RequestName'], ))
                return None

            if action.startswith('core'):
                arg = action.split('-', 1)[-1]
                tasks, set_to = arg.split(':') if ':' in arg else (None, arg)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                if 'TaskChain' in initial:
                    core_dict = {}
                    # was never initialised: NameError on the first task
                    time_dict = {}
                    # Start from the per-task memory of an earlier 'mem'
                    # action when there is one.
                    mem_dict = payload['Memory'] if isinstance(
                        payload['Memory'], dict) else {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t not in initial:
                            break
                        tname = payload.setdefault(t, initial[t])['TaskName']
                        mcore = core_dict.setdefault(tname,
                                                     payload[t]['Multicore'])
                        mem = mem_dict.setdefault(tname, payload[t]['Memory'])
                        if tasks and tname not in tasks:
                            print("%s not concerned" % (tname, ))
                            continue

                        factor = (set_to / float(mcore))
                        # 40% of the memory is taken as constant; the rest
                        # scales with the number of cores.
                        fraction_constant = 0.4
                        mem_per_core_c = int(
                            (1 - fraction_constant) * mem / float(mcore))
                        ##scale the memory
                        mem_dict[tname] += (set_to - mcore) * mem_per_core_c
                        ## scale time/event
                        time_dict[tname] = payload[t]['TimePerEvent'] / factor
                        ## set the number of cores
                        core_dict[tname] = set_to
                    payload['Multicore'] = core_dict
                    # Write the scaled memory back; without this it was lost
                    # unless a 'mem' action had already made Memory a dict.
                    payload['Memory'] = mem_dict
                    ##payload['TimePerEvent'] = time_dict ## cannot be used yet
                else:
                    # was `increase`, which is undefined (or a stale memory
                    # increment) in this branch
                    payload['Multicore'] = set_to

    # Name the recovery ACDC<round>_<string>; the round is one more than the
    # number (all leading digits, none counting as 0) found after an existing
    # 'ACDC' prefix, and 0 when there is no such prefix.
    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        digits = ''
        for ch in initial_string[4:]:
            if not ch.isdigit():
                break
            digits += ch
        acdc_round = (int(digits) if digits else 0) + 1
    initial_string = initial_string.replace('ACDC_', '').replace(
        'ACDC%d_' % (acdc_round - 1), '')
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(json.dumps(payload, indent=2))
    print(actions)

    ## submit, retrying once
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % (initial["RequestName"], ))
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" %
                  (initial["RequestName"], ))
            return None

    ## perform modifications that need the ACDC to exist
    if actions:
        for action in actions:
            if action.startswith('split'):
                factor = int(action.split('-')[-1]) if '-' in action else 2
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                for split in splittings:
                    # only the first granularity key present is divided
                    for act in [
                            'avg_events_per_job', 'events_per_job',
                            'lumis_per_job'
                    ]:
                        if act in split:
                            before = split[act]
                            split[act] /= factor
                            print("Changing %s (%d) by a factor %d to %s" %
                                  (act, before, factor, split[act]))
                            break
                    split['requestName'] = acdc
                    print("changing the splitting of %s" % (acdc, ))
                    print(json.dumps(split, indent=2))
                    print(reqMgrClient.setWorkflowSplitting(url, acdc, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
Exemple #6
0
def singleRecovery(url, task, initial, actions, do=False):
    print "Inside single recovery!"
    payload = {
        "Requestor" : os.getenv('USER'),
        "Group" : 'DATAOPS',
        "RequestType" : "Resubmission",
        "ACDCServer" : initial['CouchURL'],
        "ACDCDatabase" : "acdcserver",
        "OriginalRequestName" : initial['RequestName'],
        "OpenRunningTimeout" : 0
    }
    copy_over = ['PrepID','Campaign','RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group', 'Memory', 'RequestString' ,'CMSSWVersion']
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print c,"not in the initial payload"

    #a massage ? boost the recovery over the initial wf
#    payload['RequestPriority'] *= 10
    #Max priority is 1M
    payload['RequestPriority'] = min(500000,  payload['RequestPriority']*2 ) ## never above 500k

    #change parameters based on actions here
    if actions:
        for action in actions:
            if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same':
                payload['Memory'] = actions[action]
                print "Memory set to " + actions[action]
                ## Taskchains needs to be treated special to set the memory to all tasks
                if 'TaskChain' in initial:
                    it = 1
                    while True:
                        t = 'Task%d'%it
                        it += 1
                        if t in initial:
                            payload[t] = copy.deepcopy(initial[t])
                            payload[t]['Memory'] = actions[action]
                        else:
                            break

            if action.startswith('split'):
                split_alert = (initial['RequestType'] in ['MonteCarlo'] )
                for key in initial:
                    if key == 'SplittingAlgo' and (initial[key] in ['EventBased']):
                        split_alert = True
                    elif key.startswith('Task') and key != 'TaskChain':
                        for key2 in initial[key]:
                            if key2 == 'TaskName':
                                print "task",task.split('/')[-1]
                                print "TaskName",initial[key][key2]
                                if (initial[key][key2] == task) and (initial[key][key2] in ['EventBased']):
                                    split_alert = True
                if split_alert:
                    sendLog('actor','Cannot change splitting for %s'%initial['RequestName'],level='warning')
                    print "I should not be doing splitting for this type of request",initial['RequestName']
                    return None

    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        if initial_string[4].isdigit():
            acdc_round = int(initial_string[4])
        acdc_round += 1

    initial_string = initial_string.replace('ACDC_','').replace('ACDC%d_'%(acdc_round-1),'')
    payload['RequestString'] = 'ACDC%d_%s'%(acdc_round,initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print json.dumps( payload, indent=2)
        return None

    print "ACDC payload"
#    print json.dumps( payload , indent=2)
    print actions

    ## submit here
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print "Error in making ACDC for",initial["RequestName"]
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print "Error twice in making ACDC for",initial["RequestName"]
            sendLog('actor','Failed twice in making ACDCs for %s!'%initial['RequestName'],level='critical')                
            return None

    ## change splitting if requested
    if actions:
        for action in actions:
            if action.startswith('split'):
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                if actions[action] != 'Same' and actions[action] != 'max':
                    factor = int(actions[action][0:-1]) if 'x' in actions[action] else 2
                    for split in splittings:
                        for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                            if act in split:
                                print "Changing %s (%d) by a factor %d"%( act, split[act], factor),
                                split[act] /= factor
                                print "to",split[act]
                                break
                        split['requestName'] = acdc
                        print "changing the splitting of",acdc
                        print json.dumps( split, indent=2 )
                        print reqMgrClient.setWorkflowSplitting(url, acdc, split )
                elif 'max' in actions[action]:
                    for split in splittings:
                        for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                            if act in split:
                                print "Changing %s (%d) "%( act, split[act]),
                                split[act] = 1
                                print "to max splitting ",split[act]
                                break
                        split['requestName'] = acdc
                        print "changing the splitting of",acdc
                        print json.dumps( split, indent=2 )
                        print reqMgrClient.setWorkflowSplitting(url, acdc, split )

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    

    print data
    return acdc
Exemple #7
0
def singleClone(url, wfname, actions, comment, do=False):
    
    wfi = workflowInfo(url, wfname)
    payload = wfi.getSchema()
    initial = wfi.request

    payload['Requestor']           = os.getenv('USER')
    payload['Group']               = 'DATAOPS'
    payload['OriginalRequestName'] = initial['RequestName']
    payload['RequestPriority'] = initial['RequestPriority']

    if 'ProcessingVersion' in initial:
        payload['ProcessingVersion'] = int(initial['ProcessingVersion']) +1
    else:
        payload['ProcessingVersion'] = 2


## drop parameters on the way to reqmgr2
    paramBlacklist = ['BlockCloseMaxEvents', 'BlockCloseMaxFiles', 'BlockCloseMaxSize', 'BlockCloseMaxWaitTime',
                  'CouchWorkloadDBName', 'CustodialGroup', 'CustodialSubType', 'Dashboard',
                  'GracePeriod', 'HardTimeout', 'InitialPriority', 'inputMode', 'MaxMergeEvents', 'MaxMergeSize',
                  'MaxRSS', 'MaxVSize', 'MinMergeSize', 'NonCustodialGroup', 'NonCustodialSubType',
                  'OutputDatasets', 'ReqMgr2Only', 'RequestDate' 'RequestorDN', 'RequestName', 'RequestStatus',
                  'RequestTransition', 'RequestWorkflow', 'SiteWhitelist', 'SoftTimeout', 'SoftwareVersions',
                  'SubscriptionPriority', 'Team', 'timeStamp', 'TrustSitelists', 'TrustPUSitelists',
                  'TotalEstimatedJobs', 'TotalInputEvents', 'TotalInputLumis', 'TotalInputFiles','checkbox',
                  'DN', 'AutoApproveSubscriptionSites', 'NonCustodialSites', 'CustodialSites', 'OriginalRequestName', 'Teams', 'OutputModulesLFNBases', 
                  'SiteBlacklist', 'AllowOpportunistic', '_id']
    for p in paramBlacklist:
        if p in payload:
            payload.pop( p )
            pass

    if actions:
        for action in actions:
            if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same':
                if 'TaskChain' in payload:
                    print "Setting memory for clone of task chain"
                    it=1
                    while True:
                        t = 'Task%d'%it
                        it+=1
                        if t in payload:
                            payload[t]['Memory'] = actions[action]
                            print "Memory set for Task%d"%it
                        else:
                            break
                else:
                    print "Setting memory for non-taskchain workflow"
                    payload['Memory'] = actions[action]
                print "Memory set to " + actions[action]
                #This line is doesn't work for some reason
#                wfi.sendLog('actor','Memory of clone set to %d'%actions[action])

    print "Clone payload"
#    print json.dumps( payload , indent=2)
    print actions

    #Create clone
    clone = reqMgrClient.submitWorkflow(url, payload)
    if not clone:
        print "Error in making clone for",initial["RequestName"]
        clone = reqMgrClient.submitWorkflow(url, payload)
        if not clone:
            print "Error twice in making clone for",initial["RequestName"]
            sendLog('actor','Failed to make a clone twice for %s!'%initial["RequestName"],level='critical')
            wfi.sendLog('actor','Failed to make a clone twice for %s!'%initial["RequestName"])
            return None

    if actions:
        for action in actions:
            if action.startswith('split'):
                cloneinfo = workflowInfo(url, clone)
                splittings = cloneinfo.getSplittings()
                if actions[action] != 'Same' and actions[action] != 'max' and actions[action] != '':
                    factor = int(actions[action][0:-1]) if 'x' in actions[action] else 2
                    for split in splittings:
                        for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                            if act in split:
                                wfi.sendLog('actor','Changing %s (%d) by a factor %d'%( act, split[act], factor))
                                print "Changing %s (%d) by a factor %d"%( act, split[act], factor),
                                split[act] /= factor
                                print "to",split[act]
                                break
                        split['requestName'] = clone
                        print "changing the splitting of",clone
                        print json.dumps( split, indent=2 )
                        print reqMgrClient.setWorkflowSplitting(url, clone, split )
                elif 'max' in actions[action]:
                    for split in splittings:
                        for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                            if act in split:
                                wfi.sendLog('actor','Max splitting set for %s (%d'%( act, split[act]))
                                print "Changing %s (%d) "%( act, split[act]),
                                split[act] = 1
                                print "to max splitting ",split[act]
                                break
                        split['requestName'] = clone
                        print "changing the splitting of",clone
                        print json.dumps( split, indent=2 )
                        print reqMgrClient.setWorkflowSplitting(url, clone, split )

    #Approve
    data = reqMgrClient.setWorkflowApproved(url, clone)
    wfi.sendLog('actor','Cloned into %s'%clone)

#    wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment ))
#    wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False)

    print data
    return clone
Exemple #8
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads(
        open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update(
        json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))))
                sendLog(
                    'assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))),
                    level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks +
                                  getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([
                        SI.SE_to_CE(psite)
                        for (psite, (there, frac)) in presence.items() if there
                    ]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Exemple #9
0
def singleClone(url, wfname, actions, comment, do=False):
    
    wfi = workflowInfo(url, wfname)
    payload = wfi.getSchema()
    initial = wfi.request

    payload['Requestor']           = os.getenv('USER')
    payload['Group']               = 'DATAOPS'
    payload['OriginalRequestName'] = initial['RequestName']
    payload['RequestPriority'] = initial['RequestPriority']

    if 'ProcessingVersion' in initial:
        payload['ProcessingVersion'] = int(initial['ProcessingVersion']) +1
    else:
        payload['ProcessingVersion'] = 2

        
    payload = reqMgrClient.purgeClonedSchema( payload )

    if actions:
        for action in actions:
            if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same':
                if 'TaskChain' in payload:
                    print "Setting memory for clone of task chain"
                    mem_dict = {}
                    it=1
                    while True:
                        t = 'Task%d'%it
                        it+=1
                        if t in payload:
                            tname = payload[t]['TaskName']
                            mem_dict[tname] = int(actions[action])
                            print "Memory set for Task%d"%it
                        else:
                            break
                    payload['Memory'] = mem_dict
                else:
                    print "Setting memory for non-taskchain workflow"
                    payload['Memory'] = int(actions[action])
                print "Memory set to " + actions[action]

    print "Clone payload"
#    print json.dumps( payload , indent=2)
    print actions

    #Create clone
    clone = reqMgrClient.submitWorkflow(url, payload)
    if not clone:
        print "Error in making clone for",initial["RequestName"]
        clone = reqMgrClient.submitWorkflow(url, payload)
        if not clone:
            print "Error twice in making clone for",initial["RequestName"]
            sendLog('actor','Failed to make a clone twice for %s!'%initial["RequestName"],level='critical')
            wfi.sendLog('actor','Failed to make a clone twice for %s!'%initial["RequestName"])
            return None

    if actions:
        for action in actions:
            if action.startswith('split'):
                cloneinfo = workflowInfo(url, clone)
                splittings = cloneinfo.getSplittingsNew(strip=True)
                if actions[action] != 'Same' and actions[action] != 'max' and actions[action] != '':
                    factor = int(actions[action][0:-1]) if 'x' in actions[action] else 2
                    for split in splittings:
                        split_par = split['splitParams']
                        for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                            if act in split_par:
                                wfi.sendLog('actor','Changing %s (%d) by a factor %d'%( act, split_par[act], factor))
                                split_par[act] /= factor
                                print "to",split_par[act]
                                break
                        #split['requestName'] = clone
                        #print "changing the splitting of",clone
                        #print json.dumps( split, indent=2 )
                        #print reqMgrClient.setWorkflowSplitting(url, clone, split )
                elif 'max' in actions[action]:
                    for split in splittings:
                        split_par = split['splitParams']
                        for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                            if act in split_par:
                                wfi.sendLog('actor','Max splitting set for %s (%d'%( act, split_par[act]))
                                print "Changing %s (%d) "%( act, split_par[act]),
                                split_par[act] = 1
                                print "to max splitting ",split_par[act]
                                break
                        #split['requestName'] = clone
                        #print "changing the splitting of",clone
                        #print json.dumps( split, indent=2 )
                        #print reqMgrClient.setWorkflowSplitting(url, clone, split )

                print "changing the splitting of",clone
                print json.dumps( splittings, indent=2 )
                print reqMgrClient.setWorkflowSplitting(url, clone, splittings )
    #Approve
    data = reqMgrClient.setWorkflowApproved(url, clone)
    #wfi.sendLog('actor','Cloned into %s'%clone)

    
#    wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment ))
#    wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False)

    print data
    return clone
def singleRecovery(url, task , initial, actions, do=False):
    payload = {
        "Requestor" : os.getenv('USER'),
        "Group" : 'DATAOPS',
        "RequestType" : "Resubmission",
        "ACDCServer" : initial['ConfigCacheUrl'],
        "ACDCDatabase" : "acdcserver",
        "OriginalRequestName" : initial['RequestName'],
        "OpenRunningTimeout" : 0
        }
    copy_over = ['PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group', 'Memory', 'RequestString' ,'CMSSWVersion']        
    for c in copy_over:
        if c in initial:
            payload[c] = copy.deepcopy(initial[c])
        else:
            print c,"not in the initial payload"

    #a massage ? boost the recovery over the initial wf
    payload['RequestPriority'] *= 2
    payload['RequestPriority'] = min(500000, payload['RequestPriority'])

    if actions:
        for action in actions:
            #if action.startswith('split'):
            #    factor = int(action.split('-')[-1]) if '-' in action else 2
            #    print "Changing time per event (%s) by a factor %d"%( payload['TimePerEvent'], factor)
            #    ## mention it's taking 2 times longer to have a 2 times finer splitting
            #    payload['TimePerEvent'] = factor*payload['TimePerEvent']
            if action.startswith('mem'):
                arg = action.split('-',1)[-1]
                increase = set_to = None
                tasks,set_to = arg.split(':') if ':' in arg else (None,arg)
                tasks = tasks.split(',') if tasks else []
                if set_to.startswith('+'):
                    increase = int(set_to[1:])
                else:
                    set_to = int(set_to)
                ## increase the memory requirement by 1G

                if 'TaskChain' in initial:
                    mem_dict = {} 
                    it = 1
                    while True:
                        t = 'Task%d'%it
                        it += 1
                        if t in initial:
                            tname = payload.setdefault(t, initial[t])['TaskName']
                            mem = mem_dict.setdefault( tname, payload[t]['Memory'])
                            if tasks and not tname in tasks:
                                print tname,"not concerned"
                                continue
                            if set_to:
                                mem_dict[tname] = set_to
                            else:
                                mem_dict[tname] += increase
                        else:
                            break
                    payload['Memory'] = mem_dict
                else:
                    payload['Memory'] = set_to
                #increase = int(action.split('-')[-1]) if '-' in action else 1000
                ## increase the memory requirement by 1G
                #payload['Memory'] += increase

            if action.startswith('split') and (initial['RequestType'] in ['MonteCarlo'] or (initial['RequestType'] in ['TaskChain'] and not 'InputDataset' in initial['Task1'])):
                print "I should not be doing splitting for this type of request",initial['RequestName']
                return None
            if action.startswith('core'):
                arg = action.split('-',1)[-1]
                tasks,set_to = arg.split(':') if ':' in arg else (None,arg)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                if 'TaskChain' in initial:
                    core_dict = {}
                    mem_dict = payload['Memory'] if type(payload['Memory'])==dict else {}
                    it = 1
                    while True:
                        t = 'Task%d'%it
                        it += 1
                        if t in initial:
                            tname = payload.setdefault(t, initial[t])['TaskName']
                            mcore = core_dict.setdefault(tname, payload[t]['Multicore'])
                            mem = mem_dict.setdefault(tname, payload[t]['Memory'])
                            if tasks and not tname in tasks:
                                print tname,"not concerned"
                                continue

                            factor = (set_to / float(mcore))
                            fraction_constant = 0.4 
                            mem_per_core_c = int((1-fraction_constant) * mem / float(mcore))
                            ##scale the memory 
                            mem_dict[tname] += (set_to-mcore)*mem_per_core_c
                            ## scale time/event
                            time_dict[tname] = payload[t]['TimePerEvent'] /factor
                            ## set the number of cores
                            core_dict[tname] = set_to
                        else: 
                            break
                    payload['Multicore'] = core_dict
                    ##payload['TimePerEvent'] = time_dict ## cannot be used yet
                else:
                    payload['Multicore'] = increase

    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        if initial_string[4].isdigit():
            acdc_round = int(initial_string[4])
        acdc_round += 1
        #print acdc_round
        #print "This is not allowed yet"
        #return None
    initial_string = initial_string.replace('ACDC_','').replace('ACDC%d_'%(acdc_round-1),'')
    payload['RequestString'] = 'ACDC%d_%s'%(acdc_round,initial_string)
    payload['InitialTaskPath'] = task 

    if not do:
        print json.dumps( payload, indent=2)
        return None

    print "ACDC payload"
    print json.dumps( payload , indent=2)
    print actions

    ## submit
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print "Error in making ACDC for",initial["RequestName"]
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print "Error twice in making ACDC for",initial["RequestName"]
            return None
    
    ## perform modifications
    if actions:
        for action in actions:
            if action.startswith('split'):
                factor = int(action.split('-')[-1]) if '-' in action else 2
                acdcInfo = workflowInfo(url, acdc)
                splittings = acdcInfo.getSplittings()
                for split in splittings:
                    for act in ['avg_events_per_job','events_per_job','lumis_per_job']:
                        if act in split:
                            print "Changing %s (%d) by a factor %d"%( act, split[act], factor),
                            split[act] /= factor
                            print "to",split[act]
                            break
                    split['requestName'] = acdc
                    print "changing the splitting of",acdc
                    print json.dumps( split, indent=2 )
                    print reqMgrClient.setWorkflowSplitting(url, acdc, split )
                
    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print data
    return acdc
Exemple #11
0
def singleClone(url, wfname, actions, comment, do=False):

    wfi = workflowInfo(url, wfname)
    payload = wfi.getSchema()
    initial = wfi.request

    payload['Requestor'] = os.getenv('USER')
    payload['Group'] = 'DATAOPS'
    payload['OriginalRequestName'] = initial['RequestName']
    payload['RequestPriority'] = initial['RequestPriority']

    if 'ProcessingVersion' in initial:
        payload['ProcessingVersion'] = int(initial['ProcessingVersion']) + 1
    else:
        payload['ProcessingVersion'] = 2


## drop parameters on the way to reqmgr2
    paramBlacklist = [
        'BlockCloseMaxEvents', 'BlockCloseMaxFiles', 'BlockCloseMaxSize',
        'BlockCloseMaxWaitTime', 'CouchWorkloadDBName', 'CustodialGroup',
        'CustodialSubType', 'Dashboard', 'GracePeriod', 'HardTimeout',
        'InitialPriority', 'inputMode', 'MaxMergeEvents', 'MaxMergeSize',
        'MaxRSS', 'MaxVSize', 'MinMergeSize', 'NonCustodialGroup',
        'NonCustodialSubType', 'OutputDatasets', 'ReqMgr2Only', 'RequestDate'
        'RequestorDN', 'RequestName', 'RequestStatus', 'RequestTransition',
        'RequestWorkflow', 'SiteWhitelist', 'SoftTimeout', 'SoftwareVersions',
        'SubscriptionPriority', 'Team', 'timeStamp', 'TrustSitelists',
        'TrustPUSitelists', 'TotalEstimatedJobs', 'TotalInputEvents',
        'TotalInputLumis', 'TotalInputFiles', 'checkbox', 'DN',
        'AutoApproveSubscriptionSites', 'NonCustodialSites', 'CustodialSites',
        'OriginalRequestName', 'Teams', 'OutputModulesLFNBases',
        'SiteBlacklist', 'AllowOpportunistic', '_id', 'Override'
    ]
    for p in paramBlacklist:
        if p in payload:
            payload.pop(p)

    taskParamBlacklist = ['EventsPerJob']
    for i in range(1, 100):
        t = 'Task%s' % i
        if not t in payload: break
        for p in taskParamBlacklist:
            if p in payload[t]:
                payload[t].pop(p)

    if actions:
        for action in actions:
            if action.startswith('mem') and actions[action] != "" and actions[
                    action] != 'Same':
                if 'TaskChain' in payload:
                    print "Setting memory for clone of task chain"
                    mem_dict = {}
                    it = 1
                    while True:
                        t = 'Task%d' % it
                        it += 1
                        if t in payload:
                            tname = payload[t]['TaskName']
                            mem_dict[tname] = int(actions[action])
                            print "Memory set for Task%d" % it
                        else:
                            break
                    payload['Memory'] = mem_dict
                else:
                    print "Setting memory for non-taskchain workflow"
                    payload['Memory'] = int(actions[action])
                print "Memory set to " + actions[action]

    print "Clone payload"
    #    print json.dumps( payload , indent=2)
    print actions

    #Create clone
    clone = reqMgrClient.submitWorkflow(url, payload)
    if not clone:
        print "Error in making clone for", initial["RequestName"]
        clone = reqMgrClient.submitWorkflow(url, payload)
        if not clone:
            print "Error twice in making clone for", initial["RequestName"]
            sendLog('actor',
                    'Failed to make a clone twice for %s!' %
                    initial["RequestName"],
                    level='critical')
            wfi.sendLog(
                'actor', 'Failed to make a clone twice for %s!' %
                initial["RequestName"])
            return None

    if actions:
        for action in actions:
            if action.startswith('split'):
                cloneinfo = workflowInfo(url, clone)
                splittings = cloneinfo.getSplittingsNew(strip=True)
                if actions[action] != 'Same' and actions[
                        action] != 'max' and actions[action] != '':
                    factor = int(
                        actions[action][0:-1]) if 'x' in actions[action] else 2
                    for split in splittings:
                        split_par = split['splitParams']
                        for act in [
                                'avg_events_per_job', 'events_per_job',
                                'lumis_per_job'
                        ]:
                            if act in split_par:
                                wfi.sendLog(
                                    'actor',
                                    'Changing %s (%d) by a factor %d' %
                                    (act, split_par[act], factor))
                                split_par[act] /= factor
                                print "to", split_par[act]
                                break
                        #split['requestName'] = clone
                        #print "changing the splitting of",clone
                        #print json.dumps( split, indent=2 )
                        #print reqMgrClient.setWorkflowSplitting(url, clone, split )
                elif 'max' in actions[action]:
                    for split in splittings:
                        split_par = split['splitParams']
                        for act in [
                                'avg_events_per_job', 'events_per_job',
                                'lumis_per_job'
                        ]:
                            if act in split_par:
                                wfi.sendLog(
                                    'actor', 'Max splitting set for %s (%d' %
                                    (act, split_par[act]))
                                print "Changing %s (%d) " % (act,
                                                             split_par[act]),
                                split_par[act] = 1
                                print "to max splitting ", split_par[act]
                                break
                        #split['requestName'] = clone
                        #print "changing the splitting of",clone
                        #print json.dumps( split, indent=2 )
                        #print reqMgrClient.setWorkflowSplitting(url, clone, split )

                print "changing the splitting of", clone
                print json.dumps(splittings, indent=2)
                print reqMgrClient.setWorkflowSplitting(url, clone, splittings)
    #Approve
    data = reqMgrClient.setWorkflowApproved(url, clone)
    wfi.sendLog('actor', 'Cloned into %s' % clone)

    #    wfi.sendLog('actor','Cloned into %s by unified operator %s'%( clone, comment ))
    #    wfi.notifyRequestor('Cloned into %s by unified operator %s'%( clone, comment ),do_batch=False)

    print data
    return clone
Exemple #12
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered','staging'])
    if specific:
        fetch_from.extend(['considered-tried'])


    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from",fetch_from

    for status in fetch_from:
        print "getting wf in",status
        wfos.extend(session.query(Workflow).filter(Workflow.status==status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read())
    aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping']
    all_stuck = set()
    all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_pub_dir).read() ))
    all_stuck.update( getAllStuckDataset()) 

    max_per_round = UC.get('max_per_round').get('assignor',None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            return cache.index( wfn ) if wfn in cache else 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )



    for wfo in wfos:
        
        if options.limit and (n_stalled+n_assigned)>options.limit:
            break

        if max_per_round and (n_stalled+n_assigned)>max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo( url, wfo.name)

        if wfh.request['RequestStatus'] in ['rejected','aborted','aborted-completed','aborted-archived','rejected-archived'] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled+=1
            continue


        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue

        options_text=""
        if options.early: options_text+=", early option is ON"
        if options.partial: 
            options_text+=", partial option is ON"
            options_text+=", good fraction is %.2f"%options.good_enough
        


        wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled+=1
            wfh.sendLog('assignor','There is no output at all')
            sendLog('assignor','Workflow %s has no output at all'%( wfo.name), level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update( CI.campaigns[campaign] )

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go=True
                    wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)))
                    sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical')

        if secondary and check_secondary:
            if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)):
                wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))))
                sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update( allowed_secondary[sec] )

        if no_go:
            n_stalled+=1
            ## make a very loud noise if >100k priority stalled
            continue


            
        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor',"cannot decide on version number")
                n_stalled+=1
                wfo.status = 'trouble'
                session.commit()
                continue


        original_sites_allowed = copy.deepcopy( sites_allowed )
        wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=rwl ) ))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, lumis=lwl)))

        wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed))
        secondary_locations=None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update( CI.campaigns[wfh.request['Campaign']] )

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))
            
        do_partial = options.good_enough if options.partial else do_partial


        for sec in list(secondary):
            if override_sec_location: 
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction( url, sec )
                    if sec_availability >=1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog('assignor',"The secondary %s is available %s times on disk, and usable"%( sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = [] ## will block the assignment
                        wfh.sendLog('assignor',"The secondary %s is nowhere on disk"% sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        wfh.sendLog('assignor',"Intersecting with secondary requirement, now allowed %s"%sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor",dataset_endpoints[prim]
                endpoints.update( dataset_endpoints[prim] )
            set_lfn = getLFNbase( prim )
            ## if they are requested for processing, they should bbe all closed already
            closeAllBlocks(url, prim, blocks)
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            if primary_aaa:
                available_fractions[prim] =  getDatasetBlocksFraction(url, prim, only_blocks = blocks)

            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            if primary_aaa:
                sites_all_data = set()
                for (psite,(there,frac)) in presence.items():
                    if there:
                        sites_all_data.update( SI.SE_to_CEs(psite) )
                sites_all_data = list(sites_all_data)
                #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there]))
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            if primary_aaa:
                sites_with_any_data = set()
                for psite in presence.keys():
                    sites_with_any_data.update( SI.SE_to_CEs(psite) )
                sites_with_any_data = list(sites_with_any_data)
                #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            holding_but_not_allowed = set()
            for se_site in presence.keys():
                if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)):
                    holding_but_not_allowed.add( se_site )
            #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted( holding_but_not_allowed ))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready]))
                down_time = True
                ## should this be send back to considered ?
                

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        wfh.sendLog('assignor',"we need %s CPUh"%cpuh)
        if cpuh>max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)
        
        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off")
                primary_aaa=False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update( aaa_mapping.get(site,[]) )
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed))
                
        isStoreResults = ( 'StoreResults' == wfh.request.setdefault('RequestType',None) )

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled+= 1
                wfh.sendLog('assignor',"Cannot assign StoreResults request because MergedLFN is missing")
                sendLog('assignor','Cannot assign StoreResults request because MergedLFN is missing', level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and 
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist'] 
                else: 
                    wfh.sendLog('assignor',"Cannot assign StoreResults request because SiteWhitelist is missing")
                    sendLog('assignor','Cannot assign StoreResults request because SiteWhitelist is missing', level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints",sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled+=1
                continue
            
            
        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue


        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low )))
            copies_wanted = max(1., copies_wanted-1.)


        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            above_good = all([available >= do_partial for available in available_fractions.values()])
            wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay')
                n_stalled+=1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good):
                    wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor',"setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is",wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled+=1
                    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',"cannot be assign with no matched sites")
                sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
            n_stalled+=1
            continue


        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
            
            
        wfh.sendLog('assignor',"Placing the output on %s"%sites_out)
        parameters={
            'SiteWhitelist' : sites_allowed,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed))            

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed))            

        ## plain assignment here
        team='production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team


        if lheinput:
            ## throttle reading LHE article 
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v=getattr(options,key)
                    if v!=None:
                        if type(v)==str and ',' in v: 
                            parameters[key] = filter(None,v.split(','))
                        else: 
                            parameters[key] = v

        def pick_campaign( assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update( assign_parameters.get('parameters',{}) )

        if options.force_options:
            pick_campaign( assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign( assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog('assignor','Holding on to the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor','Change of splitting is on hold')                
            n_stalled+=1
            continue            

        if split_check==None or split_check==False:
            n_stalled+=1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, 
                                              wfo.name,
                                              split_check)
            wfh.sendLog('assignor','Applying the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check])))

        split_check = True ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical')
                    wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical')
                        wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical')
                        wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.")

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud","pleasse check on %s"% wfh.request['RequestName'], destination=['*****@*****.**'])
        
        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites',[])))
        
        result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters) ## team is not relevant anymore here


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock( secure, reason = 'assigning')

                except Exception as e:
                    print "fail in locking output"
                    
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                wfh.sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical')
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',"%s workflows cannot be assigned. Please take a look"%(n_stalled), level='critical')
Exemple #13
0
def singleRecovery(url, task, initial, actions, do=False):
    """Build and, when ``do`` is true, submit an ACDC recovery request.

    The payload is a "Resubmission" request that points back at the original
    workflow.  A fixed list of fields is deep-copied from ``initial`` and the
    priority is multiplied by 10.

    Args:
        url: host handed through to ``reqMgrClient`` and ``workflowInfo``.
        task: value stored as ``InitialTaskPath`` in the payload.
        initial: request dictionary of the workflow to recover.  It must hold
            ``CouchURL``, ``RequestName`` and every key listed in
            ``copy_over``; ``RequestType`` (and ``Task1`` for a TaskChain) is
            read when a split action is present.
        actions: iterable of action strings, or None/empty.  ``mem`` or
            ``mem-N`` adds 1000 or N to ``Memory``; ``split`` or ``split-N``
            divides the splitting parameters of the submitted request by 2
            or N.
        do: dry run when false: the payload is printed, nothing is submitted.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the new
        request.  None is returned for a dry run, when splitting is requested
        for a MonteCarlo request or a TaskChain whose ``Task1`` has no
        ``InputDataset``, when the request string already starts with
        ``ACDC``, or when the submission failed twice.
    """
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['CouchURL'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
    }
    copy_over = (
        'PrepID', 'RequestPriority', 'TimePerEvent', 'SizePerEvent', 'Group',
        'Memory', 'RequestString', 'CMSSWVersion',
    )
    for key in copy_over:
        # deep copy so that later changes to the payload never leak into initial
        payload[key] = copy.deepcopy(initial[key])

    # boost the recovery over the initial workflow
    payload['RequestPriority'] *= 10

    for action in actions or []:
        if action.startswith('mem'):
            # "mem-N" raises the memory requirement by N, plain "mem" by 1000
            payload['Memory'] += (int(action.split('-')[-1])
                                  if '-' in action else 1000)
        if action.startswith('split'):
            request_type = initial['RequestType']
            if request_type == 'MonteCarlo' or (
                    request_type == 'TaskChain'
                    and 'InputDataset' not in initial['Task1']):
                print("I should not be doing splitting for this type of request %s"
                      % (initial['RequestName'],))
                return None

    if payload['RequestString'].startswith('ACDC'):
        # recovering a recovery is refused by this version
        print("This is not allowed yet")
        return None
    payload['RequestString'] = 'ACDC_' + payload['RequestString']
    payload['InitialTaskPath'] = task

    # the payload is shown for the dry run and for the real submission alike
    print(json.dumps(payload, indent=2))
    if not do:
        return None

    ## submit, with a single retry
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % (initial["RequestName"],))
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % (initial["RequestName"],))
            return None

    ## perform modifications
    for action in actions or []:
        if not action.startswith('split'):
            continue
        factor = int(action.split('-')[-1]) if '-' in action else 2
        splittings = workflowInfo(url, acdc).getSplittings()
        for split in splittings:
            # only the first splitting parameter found is rescaled
            for par in ('avg_events_per_job', 'events_per_job', 'lumis_per_job'):
                if par in split:
                    before = split[par]
                    # floor division, clamped so the finest splitting is one
                    # event/lumi per job and never zero
                    split[par] = max(1, before // factor)
                    print("Changing %s (%d) by a factor %d to %s"
                          % (par, before, factor, split[par]))
                    break
            split['requestName'] = acdc
            print("changing the splitting of %s" % (acdc,))
            print(json.dumps(split, indent=2))
            print(reqMgrClient.setWorkflowSplitting(url, acdc, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
Exemple #14
0
def singleRecovery(url, task, initial, actions, do=False):
    """Build and, when ``do`` is true, submit an ACDC recovery request.

    The payload is a "Resubmission" request that points back at the original
    workflow.  Selected fields are deep-copied from ``initial`` when present,
    the priority is doubled (capped at 500000) and the request string gets an
    ``ACDC<round>_`` prefix, where the round is one more than the round found
    in the original request string.

    Args:
        url: host handed through to ``reqMgrClient`` and ``workflowInfo``.
        task: path of the task to recover; stored as ``InitialTaskPath``.  Its
            last ``/``-separated element is compared with the task names of
            ``initial`` when a split action is present.
        initial: request dictionary of the workflow to recover.
            NOTE: task dictionaries are inserted into the payload by
            reference and a ``Multicore`` default of 1 may be written into
            ``initial`` (or its tasks) by the multicore action.
        actions: mapping of action name to value, or None/empty.  Keys
            starting with ``mem`` set the memory, keys starting with
            ``multicore`` set the core count and rescale the memory, keys
            starting with ``split`` change the splitting of the submitted
            request (``"<N>x"`` divides by N, ``"max"`` sets it to 1,
            ``"Same"`` leaves it alone).  An empty string means "not set" for
            memory and multicore.
        do: dry run when false: the payload is printed, nothing is submitted.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the new
        request.  None is returned for a dry run, when a split action is given
        for a request whose splitting may not be changed, or when the
        submission failed twice.
    """
    print("Inside single recovery!")
    payload = {
        "Requestor": os.getenv('USER'),
        "Group": 'DATAOPS',
        "RequestType": "Resubmission",
        "ACDCServer": initial['ConfigCacheUrl'],
        "ACDCDatabase": "acdcserver",
        "OriginalRequestName": initial['RequestName'],
        "OpenRunningTimeout": 0,
    }
    copy_over = (
        'PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
        'SizePerEvent', 'Group', 'Memory', 'RequestString', 'CMSSWVersion',
    )
    for key in copy_over:
        if key in initial:
            payload[key] = copy.deepcopy(initial[key])
        else:
            print("%s not in the initial payload" % (key,))

    # boost the recovery over the initial workflow: double the priority, but
    # never go above 500k (max priority is 1M)
    payload['RequestPriority'] = min(500000, payload['RequestPriority'] * 2)

    split_params = ('avg_events_per_job', 'events_per_job', 'lumis_per_job')
    # share of the memory that does not scale with the number of cores
    fraction_constant = 0.4

    # change parameters based on actions here
    if actions:
        multicore_set = 'multicore' in actions and actions['multicore'] != ""
        for action in actions:
            value = actions[action]

            if action.startswith('mem') and value not in ("", 'Same'):
                if multicore_set:
                    # the multicore branch rescales the memory itself
                    continue
                set_to = int(value)
                if 'TaskChain' in initial:
                    ## task chains get the same memory for every task
                    mem_dict = {}
                    index = 1
                    while 'Task%d' % index in initial:
                        t = 'Task%d' % index
                        index += 1
                        tname = payload.setdefault(t, initial[t])['TaskName']
                        mem_dict[tname] = set_to
                    payload['Memory'] = mem_dict
                    print("Memory set to:  %s" % json.dumps(mem_dict, indent=2))
                else:
                    payload['Memory'] = set_to
                    print("Memory set to:  %s" % (set_to,))

            if action.startswith('multicore') and value != "":
                set_to = int(value)
                memory_set = ('memory' in actions
                              and actions['memory'] not in ("", 'Same'))
                if 'TaskChain' in initial:
                    ## task chains get cores and memory set for every task
                    mem_dict = (payload['Memory']
                                if isinstance(payload['Memory'], dict) else {})
                    core_dict = {}
                    index = 1
                    while 'Task%d' % index in initial:
                        t = 'Task%d' % index
                        index += 1
                        task_conf = payload.setdefault(t, initial[t])
                        tname = task_conf['TaskName']
                        mem = mem_dict.setdefault(tname, task_conf['Memory'])
                        initial_cores = task_conf.setdefault('Multicore', 1)
                        if memory_set:
                            # action values are strings: convert before the
                            # arithmetic below
                            mem = int(actions['memory'])

                        # scale the per-core share of the memory to the new
                        # number of cores
                        mem_per_core_c = int((1 - fraction_constant) * mem /
                                             float(initial_cores))
                        mem_dict[tname] = int(mem + (set_to - initial_cores) *
                                              mem_per_core_c)
                        core_dict[tname] = set_to

                        print("For  %s" % (t,))
                        print("Multicore set to  %s" % (set_to,))
                        print("Memory set to  %s" % (mem_dict[tname],))
                    payload['Memory'] = mem_dict
                    payload['Multicore'] = core_dict
                else:
                    initial_cores = initial.setdefault('Multicore', 1)
                    mem = payload['Memory']
                    if memory_set:
                        # action values are strings: convert before the
                        # arithmetic below
                        mem = int(actions['memory'])

                    # scale the per-core share of the memory to the new
                    # number of cores
                    mem_per_core_c = int(
                        (1 - fraction_constant) * mem / float(initial_cores))
                    payload['Multicore'] = set_to
                    payload['Memory'] = int(mem + (set_to - initial_cores) *
                                            mem_per_core_c)

                    print("Multicore set to  %s" % (set_to,))
                    print("Memory set to  %s" % (payload['Memory'],))

            if action.startswith('split'):
                # NOTE(review): this check runs for any value of the split
                # action, including "Same" and "" - confirm that is intended
                split_alert = initial['RequestType'] in ['MonteCarlo']
                for key in initial:
                    if key == 'SplittingAlgo' and initial[key] in ['EventBased']:
                        split_alert = True
                    elif key.startswith('Task') and key != 'TaskChain':
                        task_conf = initial[key]
                        if not (isinstance(task_conf, dict)
                                and 'TaskName' in task_conf):
                            continue
                        this_taskname = task_conf['TaskName']
                        recover_task = task.split('/')[-1]
                        print("For recovery of task %s" % (recover_task,))
                        print("Looking at task %s" % (this_taskname,))
                        if (recover_task == this_taskname
                                and task_conf['SplittingAlgo'] in ['EventBased']):
                            ## the task to be recovered is of the wrong type to
                            ## allow a change of splitting: log it, but do not
                            ## raise the alert that would stop the acdc
                            sendLog(
                                'actor',
                                'To recover on %s, changing the splitting on %s is not really allowed and this will be ignored instead of failing acdc.'
                                % (task, task_conf['SplittingAlgo']),
                                level='critical')

                if split_alert:
                    sendLog('actor',
                            'Cannot change splitting for %s' %
                            initial['RequestName'],
                            level='critical')
                    print("I should not be doing splitting for this type of request %s"
                          % (initial['RequestName'],))
                    return None

    # work out the ACDC round from an existing "ACDC<digits>_" prefix; all
    # leading digits are read so that rounds of 10 and above are handled
    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        tail = initial_string[4:]
        digits = tail[:len(tail) - len(tail.lstrip('0123456789'))]
        if digits:
            acdc_round = int(digits)
        acdc_round += 1

    initial_string = initial_string.replace('ACDC_', '').replace(
        'ACDC%d_' % (acdc_round - 1), '')
    payload['RequestString'] = 'ACDC%d_%s' % (acdc_round, initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(actions)

    ## submit here, with a single retry
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % (initial["RequestName"],))
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % (initial["RequestName"],))
            sendLog('actor',
                    'Failed twice in making ACDCs for %s!' %
                    initial['RequestName'],
                    level='critical')
            return None

    ## change splitting if requested
    if actions:
        for action in actions:
            if not action.startswith('split'):
                continue
            value = actions[action]
            splittings = workflowInfo(url, acdc).getSplittingsNew(strip=True)
            if value != 'Same' and value != 'max':
                # NOTE(review): an empty value ends up here with factor 2,
                # whereas singleClone skips it - confirm which is intended
                factor = int(value[0:-1]) if 'x' in value else 2
                for split in splittings:
                    split_par = split['splitParams']
                    if split['splitAlgo'] in ['EventBased']:
                        sendLog(
                            'actor',
                            "Changing the splitting on %s for %s is not permitted. Not changing."
                            % (split['splitAlgo'], initial["RequestName"]),
                            level='critical')
                        continue
                    # only the first splitting parameter found is rescaled
                    for par in split_params:
                        if par in split_par:
                            before = split_par[par]
                            # floor division, clamped so the result is never
                            # below the value "max" splitting would set
                            split_par[par] = max(1, before // factor)
                            print("Changing %s (%d) by a factor %d to %s"
                                  % (par, before, factor, split_par[par]))
                            break
            elif value == 'max':
                for split in splittings:
                    split_par = split['splitParams']
                    for par in split_params:
                        if par in split_par:
                            before = split_par[par]
                            split_par[par] = 1
                            print("Changing %s (%d)  to max splitting  %s"
                                  % (par, before, split_par[par]))
                            break
            print("changing the splitting of %s" % (acdc,))
            print(json.dumps(splittings, indent=2))
            # TODO: the outcome of the call is not checked yet
            reqMgrClient.setWorkflowSplitting(url, acdc, splittings)

    data = reqMgrClient.setWorkflowApproved(url, acdc)

    print(data)
    return acdc
def singleRecovery(url, task , initial, actions, do=False):
    """Build and, when ``do`` is true, submit an ACDC recovery request.

    The payload is a "Resubmission" request that points back at the original
    workflow.  A fixed list of fields is deep-copied from ``initial`` and the
    priority is multiplied by 10.

    Args:
        url: host handed through to ``reqMgrClient`` and ``workflowInfo``.
        task: value stored as ``InitialTaskPath`` in the payload.
        initial: request dictionary of the workflow to recover.  It must hold
            ``CouchURL``, ``RequestName`` and every key listed in
            ``copy_over``; ``RequestType`` (and ``Task1`` for a TaskChain) is
            read when a split action is present.
        actions: iterable of action strings, or None/empty.  ``mem`` or
            ``mem-N`` adds 1000 or N to ``Memory``; ``split`` or ``split-N``
            divides the splitting parameters of the submitted request by 2
            or N.
        do: dry run when false: the payload is printed, nothing is submitted.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the new
        request.  None is returned for a dry run, when splitting is requested
        for a MonteCarlo request or a TaskChain whose ``Task1`` has no
        ``InputDataset``, when the request string already starts with
        ``ACDC``, or when the submission failed twice.
    """
    payload = {
        "Requestor" : os.getenv('USER'),
        "Group" : 'DATAOPS',
        "RequestType" : "Resubmission",
        "ACDCServer" : initial['CouchURL'],
        "ACDCDatabase" : "acdcserver",
        "OriginalRequestName" : initial['RequestName'],
    }
    copy_over = ('PrepID', 'RequestPriority', 'TimePerEvent', 'SizePerEvent',
                 'Group', 'Memory', 'RequestString', 'CMSSWVersion')
    for field in copy_over:
        # deep copy so that later changes to the payload never leak into initial
        payload[field] = copy.deepcopy(initial[field])

    # boost the recovery over the initial workflow
    payload['RequestPriority'] *= 10

    requested = actions or []
    for action in requested:
        if action.startswith('mem'):
            # "mem-N" raises the memory requirement by N, plain "mem" by 1000
            increase = int(action.split('-')[-1]) if '-' in action else 1000
            payload['Memory'] += increase
        if action.startswith('split'):
            kind = initial['RequestType']
            chain_without_input = (kind == 'TaskChain'
                                   and 'InputDataset' not in initial['Task1'])
            if kind == 'MonteCarlo' or chain_without_input:
                print("I should not be doing splitting for this type of request %s"
                      % (initial['RequestName'],))
                return None

    if payload['RequestString'].startswith('ACDC'):
        # recovering a recovery is refused by this version
        print("This is not allowed yet")
        return None
    payload['RequestString'] = 'ACDC_' + payload['RequestString']
    payload['InitialTaskPath'] = task

    # the payload is shown for the dry run and for the real submission alike
    print(json.dumps(payload, indent=2))
    if not do:
        return None

    ## submit, with a single retry
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % (initial["RequestName"],))
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % (initial["RequestName"],))
            return None

    ## perform modifications
    for action in requested:
        if not action.startswith('split'):
            continue
        factor = int(action.split('-')[-1]) if '-' in action else 2
        acdcInfo = workflowInfo(url, acdc)
        for split in acdcInfo.getSplittings():
            # only the first splitting parameter found is rescaled
            for par in ('avg_events_per_job', 'events_per_job', 'lumis_per_job'):
                if par in split:
                    previous = split[par]
                    # floor division, clamped so the finest splitting is one
                    # event/lumi per job and never zero
                    split[par] = max(1, previous // factor)
                    print("Changing %s (%d) by a factor %d to %s"
                          % (par, previous, factor, split[par]))
                    break
            split['requestName'] = acdc
            print("changing the splitting of %s" % (acdc,))
            print(json.dumps(split, indent=2))
            print(reqMgrClient.setWorkflowSplitting(url, acdc, split))

    data = reqMgrClient.setWorkflowApproved(url, acdc)
    print(data)
    return acdc
Exemple #16
0
def singleClone(url, wfname, actions, comment, do=False):
    """Clone workflow ``wfname`` into a new request and approve it.

    The schema of the original workflow is used as payload, with the
    processing version increased by one (or set to 2 when the original has
    none), then passed through ``reqMgrClient.purgeClonedSchema``.

    Args:
        url: host handed through to ``reqMgrClient`` and ``workflowInfo``.
        wfname: name of the workflow to clone.
        actions: mapping of action name to value, or None/empty.  Keys
            starting with ``mem`` set the memory of the clone (for every task
            of a task chain); keys starting with ``split`` change the
            splitting of the submitted clone (``"<N>x"`` divides by N,
            ``"max"`` sets it to 1, ``"Same"`` and ``""`` leave it alone).
        comment: not used by the current implementation.
        do: not used by the current implementation.
            NOTE(review): unlike singleRecovery there is no dry run here, the
            clone is always submitted - confirm this is intended.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the clone,
        or None when the submission failed twice.
    """
    wfi = workflowInfo(url, wfname)
    payload = wfi.getSchema()
    initial = wfi.request

    payload['Requestor'] = os.getenv('USER')
    payload['Group'] = 'DATAOPS'
    payload['OriginalRequestName'] = initial['RequestName']
    payload['RequestPriority'] = initial['RequestPriority']

    if 'ProcessingVersion' in initial:
        payload['ProcessingVersion'] = int(initial['ProcessingVersion']) + 1
    else:
        payload['ProcessingVersion'] = 2

    payload = reqMgrClient.purgeClonedSchema(payload)

    split_params = ('avg_events_per_job', 'events_per_job', 'lumis_per_job')

    if actions:
        for action in actions:
            value = actions[action]
            if not (action.startswith('mem') and value not in ("", 'Same')):
                continue
            if 'TaskChain' in payload:
                print("Setting memory for clone of task chain")
                mem_dict = {}
                index = 1
                while 'Task%d' % index in payload:
                    t = 'Task%d' % index
                    index += 1
                    mem_dict[payload[t]['TaskName']] = int(value)
                    # report the task that was actually handled
                    print("Memory set for %s" % (t,))
                payload['Memory'] = mem_dict
            else:
                print("Setting memory for non-taskchain workflow")
                payload['Memory'] = int(value)
            print("Memory set to %s" % (value,))

    print("Clone payload")
    print(actions)

    # Create clone, with a single retry
    clone = reqMgrClient.submitWorkflow(url, payload)
    if not clone:
        print("Error in making clone for %s" % (initial["RequestName"],))
        clone = reqMgrClient.submitWorkflow(url, payload)
        if not clone:
            print("Error twice in making clone for %s" % (initial["RequestName"],))
            sendLog('actor',
                    'Failed to make a clone twice for %s!' %
                    initial["RequestName"],
                    level='critical')
            wfi.sendLog(
                'actor', 'Failed to make a clone twice for %s!' %
                initial["RequestName"])
            return None

    if actions:
        for action in actions:
            if not action.startswith('split'):
                continue
            value = actions[action]
            splittings = workflowInfo(url, clone).getSplittingsNew(strip=True)
            if value not in ('Same', 'max', ''):
                factor = int(value[0:-1]) if 'x' in value else 2
                for split in splittings:
                    split_par = split['splitParams']
                    # only the first splitting parameter found is rescaled
                    for par in split_params:
                        if par in split_par:
                            wfi.sendLog(
                                'actor',
                                'Changing %s (%d) by a factor %d' %
                                (par, split_par[par], factor))
                            # floor division, clamped so the result is never
                            # below the value "max" splitting would set
                            split_par[par] = max(1, split_par[par] // factor)
                            print("to %s" % (split_par[par],))
                            break
            elif value == 'max':
                for split in splittings:
                    split_par = split['splitParams']
                    for par in split_params:
                        if par in split_par:
                            before = split_par[par]
                            wfi.sendLog(
                                'actor', 'Max splitting set for %s (%d)' %
                                (par, before))
                            split_par[par] = 1
                            print("Changing %s (%d)  to max splitting  %s"
                                  % (par, before, split_par[par]))
                            break

            print("changing the splitting of %s" % (clone,))
            print(json.dumps(splittings, indent=2))
            print(reqMgrClient.setWorkflowSplitting(url, clone, splittings))

    # Approve
    data = reqMgrClient.setWorkflowApproved(url, clone)

    print(data)
    return clone
Exemple #17
0
def singleRecovery(url, task, initial, actions, do=False):
    """Build and, when ``do`` is true, submit an ACDC recovery request.

    The payload is a "Resubmission" request that points back at the original
    workflow.  Selected fields are deep-copied from ``initial`` when present,
    the priority is doubled (capped at 500000) and the request string gets an
    ``ACDC<round>_`` prefix, where the round is one more than the round found
    in the original request string.

    Args:
        url: host handed through to ``reqMgrClient`` and ``workflowInfo``.
        task: path of the task to recover; stored as ``InitialTaskPath``.  Its
            last ``/``-separated element is compared with the task names of
            ``initial`` when a split action is present.
        initial: request dictionary of the workflow to recover.
            NOTE: task dictionaries are inserted into the payload by
            reference and a ``Multicore`` default of 1 may be written into
            ``initial`` (or its tasks) by the multicore action.
        actions: mapping of action name to value, or None/empty.  Keys
            starting with ``mem`` set the memory, keys starting with
            ``multicore`` set the core count and rescale the memory, keys
            starting with ``split`` change the splitting of the submitted
            request (``"<N>x"`` divides by N, ``"max"`` sets it to 1,
            ``"Same"`` leaves it alone).  An empty string means "not set" for
            memory and multicore.
        do: dry run when false: the payload is printed, nothing is submitted.

    Returns:
        The value returned by ``reqMgrClient.submitWorkflow`` for the new
        request.  None is returned for a dry run, when a split action is given
        for a request whose splitting may not be changed, or when the
        submission failed twice.
    """
    print("Inside single recovery!")
    payload = {
        "Requestor" : os.getenv('USER'),
        "Group" : 'DATAOPS',
        "RequestType" : "Resubmission",
        "ACDCServer" : initial['ConfigCacheUrl'],
        "ACDCDatabase" : "acdcserver",
        "OriginalRequestName" : initial['RequestName'],
        "OpenRunningTimeout" : 0,
    }
    copy_over = ('PrepID', 'Campaign', 'RequestPriority', 'TimePerEvent',
                 'SizePerEvent', 'Group', 'Memory', 'RequestString',
                 'CMSSWVersion')
    for field in copy_over:
        if field in initial:
            payload[field] = copy.deepcopy(initial[field])
        else:
            print("%s not in the initial payload" % (field,))

    # boost the recovery over the initial workflow: double the priority, but
    # never go above 500k (max priority is 1M)
    payload['RequestPriority'] = min(500000, payload['RequestPriority'] * 2)

    split_params = ('avg_events_per_job', 'events_per_job', 'lumis_per_job')
    # share of the memory that does not scale with the number of cores
    fraction_constant = 0.4

    # change parameters based on actions here
    if actions:
        multicore_set = 'multicore' in actions and actions['multicore'] != ""
        for action in actions:
            value = actions[action]

            if action.startswith('mem') and value not in ("", 'Same'):
                if multicore_set:
                    # the multicore branch rescales the memory itself
                    continue
                set_to = int(value)
                if 'TaskChain' in initial:
                    ## task chains get the same memory for every task
                    mem_dict = {}
                    number = 1
                    while 'Task%d' % number in initial:
                        t = 'Task%d' % number
                        number += 1
                        tname = payload.setdefault(t, initial[t])['TaskName']
                        mem_dict[tname] = set_to
                    payload['Memory'] = mem_dict
                    print("Memory set to:  %s" % json.dumps(mem_dict, indent=2))
                else:
                    payload['Memory'] = set_to
                    print("Memory set to:  %s" % (set_to,))

            if action.startswith('multicore') and value != "":
                set_to = int(value)
                memory_set = ('memory' in actions
                              and actions['memory'] not in ("", 'Same'))
                if 'TaskChain' in initial:
                    ## task chains get cores and memory set for every task
                    mem_dict = (payload['Memory']
                                if isinstance(payload['Memory'], dict) else {})
                    core_dict = {}
                    number = 1
                    while 'Task%d' % number in initial:
                        t = 'Task%d' % number
                        number += 1
                        task_conf = payload.setdefault(t, initial[t])
                        tname = task_conf['TaskName']
                        mem = mem_dict.setdefault(tname, task_conf['Memory'])
                        initial_cores = task_conf.setdefault('Multicore', 1)
                        if memory_set:
                            # action values are strings: convert before the
                            # arithmetic below
                            mem = int(actions['memory'])

                        # scale the per-core share of the memory to the new
                        # number of cores
                        mem_per_core_c = int((1 - fraction_constant) * mem /
                                             float(initial_cores))
                        mem_dict[tname] = int(mem + (set_to - initial_cores) *
                                              mem_per_core_c)
                        core_dict[tname] = set_to

                        print("For  %s" % (t,))
                        print("Multicore set to  %s" % (set_to,))
                        print("Memory set to  %s" % (mem_dict[tname],))
                    payload['Memory'] = mem_dict
                    payload['Multicore'] = core_dict
                else:
                    initial_cores = initial.setdefault('Multicore', 1)
                    mem = payload['Memory']
                    if memory_set:
                        # action values are strings: convert before the
                        # arithmetic below
                        mem = int(actions['memory'])

                    # scale the per-core share of the memory to the new
                    # number of cores
                    mem_per_core_c = int(
                        (1 - fraction_constant) * mem / float(initial_cores))
                    payload['Multicore'] = set_to
                    payload['Memory'] = int(mem + (set_to - initial_cores) *
                                            mem_per_core_c)

                    print("Multicore set to  %s" % (set_to,))
                    print("Memory set to  %s" % (payload['Memory'],))

            if action.startswith('split'):
                # NOTE(review): this check runs for any value of the split
                # action, including "Same" and "" - confirm that is intended
                split_alert = initial['RequestType'] in ['MonteCarlo']
                for key in initial:
                    if key == 'SplittingAlgo' and initial[key] in ['EventBased']:
                        split_alert = True
                    elif key.startswith('Task') and key != 'TaskChain':
                        task_conf = initial[key]
                        if not (isinstance(task_conf, dict)
                                and 'TaskName' in task_conf):
                            continue
                        this_taskname = task_conf['TaskName']
                        recover_task = task.split('/')[-1]
                        print("For recovery of task %s" % (recover_task,))
                        print("Looking at task %s" % (this_taskname,))
                        if (recover_task == this_taskname
                                and task_conf['SplittingAlgo'] in ['EventBased']):
                            ## the task to be recovered is of the wrong type to
                            ## allow a change of splitting: log it, but do not
                            ## raise the alert that would stop the acdc
                            sendLog('actor','To recover on %s, changing the splitting on %s is not really allowed and this will be ignored instead of failing acdc.'%( task, task_conf['SplittingAlgo']), level='critical')

                if split_alert:
                    sendLog('actor','Cannot change splitting for %s'%initial['RequestName'],level='critical')
                    print("I should not be doing splitting for this type of request %s"
                          % (initial['RequestName'],))
                    return None

    # work out the ACDC round from an existing "ACDC<digits>_" prefix; all
    # leading digits are read so that rounds of 10 and above are handled
    acdc_round = 0
    initial_string = payload['RequestString']
    if initial_string.startswith('ACDC'):
        tail = initial_string[4:]
        digits = tail[:len(tail) - len(tail.lstrip('0123456789'))]
        if digits:
            acdc_round = int(digits)
        acdc_round += 1

    initial_string = initial_string.replace('ACDC_','').replace('ACDC%d_'%(acdc_round-1),'')
    payload['RequestString'] = 'ACDC%d_%s'%(acdc_round,initial_string)
    payload['InitialTaskPath'] = task

    if not do:
        print(json.dumps(payload, indent=2))
        return None

    print("ACDC payload")
    print(actions)

    ## submit here, with a single retry
    acdc = reqMgrClient.submitWorkflow(url, payload)
    if not acdc:
        print("Error in making ACDC for %s" % (initial["RequestName"],))
        acdc = reqMgrClient.submitWorkflow(url, payload)
        if not acdc:
            print("Error twice in making ACDC for %s" % (initial["RequestName"],))
            sendLog('actor','Failed twice in making ACDCs for %s!'%initial['RequestName'],level='critical')
            return None

    ## change splitting if requested
    if actions:
        for action in actions:
            if not action.startswith('split'):
                continue
            value = actions[action]
            acdcInfo = workflowInfo(url, acdc)
            splittings = acdcInfo.getSplittingsNew(strip=True)
            if value != 'Same' and value != 'max':
                # NOTE(review): an empty value ends up here with factor 2,
                # whereas singleClone skips it - confirm which is intended
                factor = int(value[0:-1]) if 'x' in value else 2
                for split in splittings:
                    split_par = split['splitParams']
                    if split['splitAlgo'] in ['EventBased']:
                        sendLog('actor',"Changing the splitting on %s for %s is not permitted. Not changing."%(split['splitAlgo'],initial["RequestName"]), level='critical')
                        continue
                    # only the first splitting parameter found is rescaled
                    for par in split_params:
                        if par in split_par:
                            previous = split_par[par]
                            # floor division, clamped so the result is never
                            # below the value "max" splitting would set
                            split_par[par] = max(1, previous // factor)
                            print("Changing %s (%d) by a factor %d to %s"
                                  % (par, previous, factor, split_par[par]))
                            break
            elif value == 'max':
                for split in splittings:
                    split_par = split['splitParams']
                    for par in split_params:
                        if par in split_par:
                            previous = split_par[par]
                            split_par[par] = 1
                            print("Changing %s (%d)  to max splitting  %s"
                                  % (par, previous, split_par[par]))
                            break
            print("changing the splitting of %s" % (acdc,))
            print(json.dumps(splittings, indent=2))
            # TODO: the outcome of the call is not checked yet
            reqMgrClient.setWorkflowSplitting(url, acdc, splittings)

    data = reqMgrClient.setWorkflowApproved(url, acdc)

    print(data)
    return acdc